def main(_): """ Runs {FLAGS.num_episodes} games of Hanabi-small using a trained ppo agent""" # --- Load ppo agent with trained policy --- # agent = TrainedPPOAgent(train_dir=FLAGS.root_dir) # --- Create Environment for evaluation --- # env = rl_env.make(environment_name=FLAGS.game_type, num_players=FLAGS.num_players) eval_env = PyhanabiEnvWrapper(env) # --- Run single game for inspection --- # for i in range(FLAGS.num_episodes): turn = 1 sum_rewards = 0 time_step = eval_env.reset() while not time_step.is_last(): # act action = agent.act(time_step) # step time_step = eval_env.step(action) if time_step.reward > 0: print(f'Got reward {time_step.reward} at turn {turn}') sum_rewards += time_step.reward turn += 1 if time_step.is_last(): print(f'Game {i+1} ended at turn {turn-1}') print(f'total reward was {sum_rewards}')
def main(_):
    experiment_logger = logger.Logger(f'{FLAGS.root_dir}')

    env = rl_env.make(environment_name=FLAGS.game_type, num_players=FLAGS.num_players)
    obs_stacker = run_experiment.create_obs_stacker(environment=env, history_size=1)
    agent = rainbow_agent.RainbowAgent(
        observation_size=obs_stacker.observation_size(),
        num_actions=env.num_moves(),
        num_players=env.players)

    # Reload checkpoint if possible.
    start_iter, checkpointer = run_experiment.initialize_checkpointing(
        agent=agent,
        experiment_logger=experiment_logger,
        checkpoint_dir=f'{FLAGS.base_dir}/checkpoints')

    run_experiment.run_experiment(
        agent=agent,
        environment=env,
        start_iteration=start_iter,
        obs_stacker=obs_stacker,
        experiment_logger=experiment_logger,
        experiment_checkpointer=checkpointer,
        checkpoint_dir=f'{FLAGS.base_dir}/checkpoints',
        summary_dir=f'{FLAGS.base_dir}/summary',
    )
def __init__(self, train_dir):
    # --- TF session --- #
    tf.reset_default_graph()
    self.sess = tf.Session()
    tf.compat.v1.enable_resource_variables()

    # --- Environment stub --- #
    env = rl_env.make(environment_name=FLAGS.game_type, num_players=FLAGS.num_players)
    wrapped_env = PyhanabiEnvWrapper(env)
    tf_env = tf_py_environment.TFPyEnvironment(wrapped_env)

    with self.sess.as_default():
        # --- Init networks --- #
        actor_net, value_net = load_networks(tf_env)

        # --- Init agent --- #
        agent = PPOAgent(  # set up PPO agent with tf_agents
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            actor_net=actor_net,
            value_net=value_net,
            train_step_counter=tf.compat.v1.train.get_or_create_global_step(),
            normalize_observations=False)

        # --- Init policy --- #
        self.policy = py_tf_policy.PyTFPolicy(agent.policy)
        self.policy.initialize(None)

        # --- Restore from checkpoint --- #
        self.policy.restore(policy_dir=train_dir, assert_consumed=False)

        # Run the agent's initialization ops in the TF graph.
        self.sess.run(agent.initialize())
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.agent_config = {'players': flags['players']}
    self.environment = rl_env.make('Hanabi-Full', num_players=flags['players'])
    self.agent_class = AGENT_CLASSES[flags['agent_class']]
def __init__(self,
             root_dir,
             game_type='Hanabi-Full',
             num_players=4,
             actor_fc_layers=(100,),
             value_fc_layers=(100,),
             use_value_network=False):
    tf.reset_default_graph()
    self.sess = tf.Session()
    tf.compat.v1.enable_resource_variables()

    pyhanabi_env = rl_env.make(environment_name=game_type, num_players=num_players)
    py_env = PyhanabiEnvWrapper(pyhanabi_env)
    tf_env = tf_py_environment.TFPyEnvironment(py_env)

    with self.sess.as_default():
        # Init the agent's networks.
        actor_net = masked_networks.MaskedActorDistributionNetwork(
            tf_env.observation_spec(),
            tf_env.action_spec(),
            fc_layer_params=actor_fc_layers)

        value_network = None
        if use_value_network:
            value_network = MaskedValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

        # The global step is required as the agent's train step counter.
        global_step = tf.compat.v1.train.get_or_create_global_step()

        tf_agent = reinforce_agent.ReinforceAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            actor_network=actor_net,
            value_network=value_network,  # stays None unless use_value_network
            value_estimation_loss_coef=.2,
            gamma=.9,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=global_step)

        self.policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

        # Load the checkpoint from root_dir.
        self.policy.initialize(None)
        self.policy.restore(root_dir)

        init_agent_op = tf_agent.initialize()
        self.sess.run(init_agent_op)
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.agent_config = {
        'players': flags['players'],
        'alpha': flags['alpha'],
        'gamma': flags['gamma']
    }
    self.environment = rl_env.make('Hanabi-My-Small',
                                   num_players=flags['players'],
                                   seed=flags['seed'])
    self.agent_class = AGENT_CLASSES[flags['agent_class']]
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.environment = rl_env.make('Hanabi-Very-Small',
                                   num_players=self.flags['players'])
    self.agent_config = {
        'players': flags['players'],
        'state_size': self.environment.vectorized_observation_shape()[0],
        'action_size': self.environment.num_moves(),
        'env': self.environment
    }
    self.agent_class = AGENT_CLASSES[flags['agent_class']]
def create_environment(game_type='Hanabi-Full', num_players=2):
    """Creates the Hanabi environment.

    Args:
      game_type: Type of game to play. Currently the following are supported:
        Hanabi-Full: Regular game.
        Hanabi-Small: The small version of Hanabi, with 2 cards and 2 colours.
      num_players: Int, number of players to play this game.

    Returns:
      A Hanabi environment.
    """
    return rl_env.make(environment_name=game_type,
                       num_players=num_players,
                       pyhanabi_path=None)
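# Usage sketch for create_environment (assuming the pyhanabi bindings are
# installed): build the small variant and inspect its sizes using the
# HanabiEnv accessors already relied on elsewhere in this section.
env = create_environment(game_type='Hanabi-Small', num_players=2)
print(env.vectorized_observation_shape())  # length of the vectorized observation
print(env.num_moves())                     # size of the action space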
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.env = rl_env.make('Hanabi-Full', num_players=flags['players'])
    # Create configurations.
    self.agent_config, self.agent_2_config = self.generate_config(flags)
    # Use the configurations to create the first agent.
    self.agent = load_agent(flags['agent_class'])(self.agent_config)
    if flags['agent2_class'] != flags['agent_class']:
        # Use the configurations to create the second agent.
        self.agent2 = load_agent(flags['agent2_class'])(self.agent_2_config)
def __init__(self, game_type, agents):
    self.num_players = len(agents)
    self.game_type = game_type
    self.history_size = 1
    self.env = rl_env.make(environment_name=self.game_type,
                           num_players=self.num_players)
    self.py_env = PyhanabiEnvWrapper(self.env)
    # Hard-coded length of the vectorized observation for this configuration.
    self.observation_size = 1041
    self.max_moves = self.env.num_moves()
    self.agents = agents
    for agent in agents:
        agent.eval_mode = True
def __init__(self, flags):
    """Initialize runner."""
    self.num_episodes = flags['num_episodes']
    self.environment = batched_py_environment.BatchedPyEnvironment(
        [rl_env.make('Hanabi-Full', num_players=flags['players'])])
    self.agent_config = {
        'max_information_tokens':
            self.environment.envs[0].game.max_information_tokens(),
        'action_spec': self.environment.action_spec(),
        'observation_spec': self.environment.observation_spec(),
        'environment_batch_size': self.environment.batch_size
    }
    self.agent_1 = AGENT_CLASSES[flags['agent_1']](self.agent_config)
    self.agent_2 = AGENT_CLASSES[flags['agent_2']](self.agent_config)
def run(self):
    for episode in range(self.episodes):
        environment = rl_env.make('Hanabi-Full', num_players=len(self.agents))
        observations = environment.reset()
        agents = self.agents
        for agent in agents:
            agent.reset(self.config)
        done = False
        episode_reward = 0
        turns = 0
        player_id = -1
        current_player_action = None
        while not done:
            # Let the current player choose an action.
            for agent_id, agent in enumerate(agents):
                agent.id = agent_id
                observation = observations['player_observations'][agent_id]
                if observations['current_player'] == agent_id:
                    action = agent.act(observation)
                    assert action is not None
                    current_player_action = action
                    player_id = agent_id
            # Make an environment step.
            observations, reward, done, unused_info = environment.step(
                current_player_action)
            if current_player_action is not None:
                hand = None
                # If the action is a reveal, we need to pass the target hand.
                if current_player_action["action_type"].startswith("REVEAL"):
                    observed_hands = observations['player_observations'][
                        player_id]["observed_hands"]
                    hand = observed_hands[current_player_action["target_offset"]]
                # Pass the action on to all agents.
                for agent_id, agent in enumerate(agents):
                    agent.on_action(player_id, current_player_action, hand)
            episode_reward += reward
            turns += 1
        self.rewards[episode] = episode_reward
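# The run loop above assumes every agent implements reset/act/on_action. A
# minimal stub that satisfies this interface (the class name WatchingAgent is
# hypothetical, not from the original code):
class WatchingAgent:
    def reset(self, config):
        """Clear any per-episode state."""

    def act(self, observation):
        # rl_env player observations expose the currently legal moves as dicts,
        # which environment.step() accepts directly.
        return observation['legal_moves'][0]

    def on_action(self, player_id, action, hand):
        """Observe every executed action, plus the target hand on REVEAL moves."""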
def __init__(self,
             root_dir,
             game_type,
             num_players,
             actor_fc_layers=(150, 75),
             value_fc_layers=(150, 75),
             actor_fc_layers_rnn=(150,),
             value_fc_layers_rnn=(150,),
             lstm_size=(75, 75),
             use_rnns=False):
    tf.reset_default_graph()
    self.sess = tf.Session()
    tf.compat.v1.enable_resource_variables()

    pyhanabi_env = rl_env.make(environment_name=game_type, num_players=num_players)
    py_env = PyhanabiEnvWrapper(pyhanabi_env)
    tf_env = tf_py_environment.TFPyEnvironment(py_env)

    with self.sess.as_default():
        # Init the agent's networks.
        if use_rnns:
            actor_net = masked_networks.MaskedActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers_rnn,
                output_fc_layer_params=None,
                lstm_size=lstm_size)
            value_net = MaskedValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)
        else:
            actor_net = masked_networks.MaskedActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers)
            value_net = MaskedValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

        # The global step serves as the agent's train step counter.
        global_step = tf.compat.v1.train.get_or_create_global_step()

        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            actor_net=actor_net,
            value_net=value_net,
            train_step_counter=global_step,
            # Observations are not normalized because they also include the
            # 0-1 legal-moves mask.
            normalize_observations=False)

        self.policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

        # Load checkpoint.
        self.policy.initialize(None)
        self.policy.restore(root_dir)

        init_agent_op = tf_agent.initialize()
        self.sess.run(init_agent_op)
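# Hedged usage sketch for the policy loader above (the enclosing class name is
# not shown in this section, so PPOPolicyLoader is a placeholder). PyTFPolicy
# exposes action(time_step), so the restored policy can drive a PyEnvironment
# built with the load_env helper defined later in this section:
loader = PPOPolicyLoader('/path/to/root_dir', game_type='Hanabi-Small', num_players=4)
env = load_env(variant='Hanabi-Small', players=4)
time_step = env.reset()
while not time_step.is_last():
    time_step = env.step(loader.policy.action(time_step).action)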
def load_hanabi_env(env_name="Hanabi-Full", num_players=4):
    pyhanabi_env = rl_env.make(environment_name=env_name, num_players=num_players)
    py_env = pyhanabi_env_wrapper.PyhanabiEnvWrapper(pyhanabi_env)
    return py_env
def load_env(variant="Hanabi-Small", players=4):
    pyhanabi_env = rl_env.make(environment_name=variant, num_players=players)
    py_env = pyhanabi_env_wrapper.PyhanabiEnvWrapper(pyhanabi_env)
    return py_env
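# Usage sketch for the two loaders above: the returned wrapper is consumed
# either directly as a TF-Agents PyEnvironment, or wrapped again for graph
# mode, exactly as in the constructors earlier in this section.
py_env = load_env(variant='Hanabi-Small', players=4)
time_step = py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)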
def __init__(self, agent_class, numAgents=-1, load=False, size=1000000):
    """
    Args:
        agent_class (string): the class of the agent, which can be one of:
            - 'SimpleAgent'
            - 'RainbowAgent'
            - 'RandomAgent'
        numAgents (int, optional): the number of agents.
        load (boolean, optional): whether to load possible existing data for
            the given class of agents.
        size (int, optional): how many steps are going to be saved; the
            default is 1M. This size is used to allocate memory up front.
    """
    self.size = size
    self.ptr = 0
    self.ep_start_id = self.ptr
    self.full = False
    # self.path is assumed to be a class-level attribute holding the base data dir.
    self.path = os.path.join(self.path, agent_class)

    if not load and numAgents == -1:
        print("Bad parameter initialization. Use either 'numAgents' or "
              "'load' to initialize the object.")
        exit()

    if load:
        # Load the configuration from file.
        self.config = pickle.load(
            open(os.path.join(self.path, "config.pickle"), "rb"))
        numAgents = self.config["numAgents"]
    else:
        self.config = {}  # create empty dict
        self.config["numAgents"] = numAgents  # insert config data

    try:
        # Detect the size of the observations.
        env = rl_env.make(num_players=numAgents)
        obs = env.reset()
        self.config["size_obs"] = len(
            obs['player_observations'][0]['vectorized'])
        # Detect the number of moves.
        self.n_moves = env.num_moves()
        # Initialize matrices for all values.
        self.moves = np.empty(size, dtype=np.uint8)
        self.rs = np.empty(size)
        self.obs = np.empty((size, self.config["size_obs"]), dtype=bool)
        self.eps = []
        # Initialize last episode.
        self.last_ep = -1
    except BaseException:
        # If the environment can't be created, we can still load data.
        if numAgents == 2 or numAgents == 3:
            self.n_cards = 5
        elif numAgents == 4 or numAgents == 5:
            self.n_cards = 4
        else:
            print("ERROR: invalid number of players")
            return
        # Full Hanabi: play/discard each hand card, plus 10 reveal moves
        # (5 colours + 5 ranks) for every *other* player.
        self.n_moves = (numAgents - 1) * 10 + self.n_cards * 2
        print("WARNING: the environment could not be created.")
        print("Some functionality may be compromised. You CAN still load data.")
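# Hypothetical usage of the buffer above (only __init__ is shown in this
# section, so the class name ExperienceBuffer is a placeholder): either
# allocate fresh storage for a fixed number of agents, or reload saved data.
fresh = ExperienceBuffer('SimpleAgent', numAgents=2)
reloaded = ExperienceBuffer('SimpleAgent', load=True)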
def create_environment(game_type='Hanabi-Full', num_players=2):
    """Creates the Hanabi environment."""
    return rl_env.make(environment_name=game_type,
                       num_players=num_players,
                       pyhanabi_path=None)
def run_verbose_mode(agent_1, agent_2):
    env = rl_env.make('Hanabi-Full-CardKnowledge', num_players=2)
    # Wrap the raw environment first; TFPyEnvironment expects a PyEnvironment.
    tf_env = tf_py_environment.TFPyEnvironment(PyhanabiEnvWrapper(env))
    state = tf_env.reset()