def ffa_evaluate(env: Pomme, episodes, verbose, visualize, stop=False):
    """
    Evaluates the given pommerman environment (already includes the agents).

    :param env: The pommerman environment to evaluate
    :param episodes: The number of episodes
    :param verbose: Whether to print verbose status information
    :param visualize: Whether to visualize the execution
    :param stop: Whether to wait for input after each step
    :return: The results of the evaluation of shape (episodes, 5) where the
             first column [:, 0] contains the result of the match
             (tie, win, incomplete) and the remaining columns contain the
             individual (final) rewards.
    """
    # first element: result, additional elements: rewards
    steps = np.empty(episodes)
    results = np.empty((episodes, 1 + 4))

    start = time.time()
    # Run the episodes just like OpenAI Gym
    for i_episode in range(episodes):
        state = env.reset()
        done = False
        reward = []
        info = {}
        step = 0
        while not done:
            if visualize:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            step += 1
            if stop:
                input()

        steps[i_episode] = step
        result = info['result']
        # save the result
        results[i_episode, 0] = result.value
        results[i_episode, 1:] = reward

        if verbose:
            delta = time.time() - start
            print('\r{:.2f} sec > Episode {} finished with {} ({})'.format(
                delta, i_episode, result, reward))
            if i_episode % 10 == 9 and i_episode != episodes - 1:
                ffa_print_stats(results, steps, i_episode + 1)

    env.close()

    if verbose:
        delta = time.time() - start
        print("Total time: {:.2f} sec".format(delta))
        ffa_print_stats(results, steps, episodes)

    return results
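# Illustrative driver for ffa_evaluate (the helper name and episode count are
# ours, not from the original): evaluates four SimpleAgents using the same
# setup as main() below.
def _evaluate_simple_agents(episodes=10):
    config = ffa_v1_env()
    env = Pomme(**config["env_kwargs"])
    env.set_agents([
        SimpleAgent(config["agent"](i, config["game_type"])) for i in range(4)
    ])
    env.set_init_game_state(None)
    return ffa_evaluate(env, episodes, verbose=True, visualize=False)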
def main():
    # Print all possible environments in the Pommerman registry
    print(pommerman.registry)

    config = ffa_v1_env()
    env = Pomme(**config["env_kwargs"])

    # Add four simple agents
    agents = {}
    for agent_id in range(4):
        agents[agent_id] = SimpleAgent(config["agent"](agent_id,
                                                       config["game_type"]))
    # agents[3] = PlayerAgent(config["agent"](agent_id, config["game_type"]), "arrows")

    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)

    demo = []

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        demo.append(env.get_json_info())
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            demo.append(env.get_json_info())
        if 1 in reward:
            winner = reward.index(1)
        else:
            winner = None
        print('Episode {} finished'.format(i_episode))
    env.close()

    # If the game was not tied, save the demonstration
    if winner is not None:
        demonstration = {'demo': demo, 'winner': winner}
        pickle.dump(demonstration, open("demonstration.p", "wb"))
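# Illustrative counterpart to main() (the helper name is ours): load the
# pickled demonstration back from disk.
def load_demonstration(path="demonstration.p"):
    with open(path, "rb") as f:
        demonstration = pickle.load(f)
    # 'demo' holds one JSON game state per step, 'winner' the winning agent index.
    return demonstration['demo'], demonstration['winner']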
agents = {}
for agent_id in range(3):
    agents[agent_id] = StaticAgent(config["agent"](agent_id,
                                                   config["game_type"]))

# Add human agent
agent_id += 1
agents[3] = PlayerAgent(config["agent"](agent_id, config["game_type"]),
                        "arrows")

env.set_agents(list(agents.values()))
env.set_init_game_state(None)

# Seed and reset the environment
env.seed(0)
obs = env.reset()

# Run the agents until we're done
done = False
while not done:
    env.render()
    actions = env.act(obs)
    # not needed:
    # actions = [action % 4 for action in actions]
    # actions = [0, actions[1]]
    obs, reward, done, info = env.step(actions)
    # debug: featurize(obs[0])
env.render(close=True)
env.close()

# Print the result
print(info)
class PomFFA(gym.Env):

    agent_list = [HoldAgent(), HoldAgent(), HoldAgent(), HoldAgent()]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")
        print(pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space
        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))

        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        obs = self.preprocess(obs)
        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Top up the winner to a total of +1; losers get -0.5.
            if agent_id in obs['alive']:
                return 1.0 - self.total_reward
            else:
                return -0.5
        if obs["step_count"] >= 500:
            # Game is over on time. Everyone gets -0.5.
            return -0.5
        # Game still running: shaped reward for the living, -0.5 for the dead.
        if agent_id not in obs['alive']:
            return -0.5

        x, y = obs["position"]
        blast = obs["bomb_blast_strength"]
        px = [1, -1, 0, 0]
        py = [0, 0, -1, 1]

        sum_reward = 0.0
        # Shaped term proportional to the change in the number of living
        # agents since the previous step.
        sum_reward += 20 * (len(obs["alive"]) - self.prev_alive)
        self.prev_alive = len(obs["alive"])

        if action == 0:
            # Small penalty for standing still.
            sum_reward -= 0.1
        elif action == 5:
            # Bomb placed: bonus for adjacent cells with board value 1 and a
            # larger bonus for adjacent agents (values > 10).
            # sum_reward += 1
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if tx < 0 or tx > 10 or ty < 0 or ty > 10:
                    continue
                if obs["board"][tx][ty] == 1:
                    sum_reward += 2
                elif obs["board"][tx][ty] > 10:
                    sum_reward += 4
        else:
            assert (1 <= action <= 4), str(action)
            dx = x + px[action - 1]
            dy = y + py[action - 1]
            # Exploration bonus for moving onto a new passage cell, small
            # penalty for revisiting one.
            if (not (dx < 0 or dx > 10 or dy < 0 or dy > 10)) \
                    and obs["board"][dx][dy] == 0:
                if self.visited[dx][dy] > 0:
                    sum_reward -= 0.1
                else:
                    sum_reward += 0.3
                    self.visited[dx][dy] = 1

        sum_reward = sum_reward * 1.0 / 100.0
        new_total_reward = self.total_reward + sum_reward
        # Cap the cumulative shaped reward to the interval (-0.5, 0.8).
        if new_total_reward > 0.8 or new_total_reward < -0.5:
            sum_reward = 0.0
        else:
            self.total_reward = new_total_reward
        return sum_reward

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order].copy()

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs
    def init_observation_space(self, env_config):
        """
        observations for agents
        board: n^2
        bomb blast strength: n^2
        bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))

        board = spaces.Box(low=0, high=len(constants.Item),
                           shape=(board_size, board_size))
        danger = spaces.Box(low=0, high=20, shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=10, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))

        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength,
            "danger": danger
        })

    @staticmethod
    def preprocess(obs):
        # Drop observation entries that are not part of the observation space.
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']

        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])

        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']

        # Remap raw board item codes to a compact encoding and initialize the
        # danger map (ticks until a cell may be covered by a blast).
        danger = np.ndarray(shape=(11, 11), dtype=int)
        for x in range(11):
            for y in range(11):
                danger[x][y] = 10
                if board[x][y] == 4:        # flames: immediately dangerous
                    board[x][y] = 0
                    danger[x][y] = 0
                elif board[x][y] == 3:      # bomb: treat the cell as passage
                    board[x][y] = 0
                elif board[x][y] == 10:     # own agent
                    board[x][y] = 1
                elif board[x][y] > 10:      # enemy agents
                    board[x][y] = 5
                elif 6 <= board[x][y] <= 8:  # power-ups
                    board[x][y] = 3
                elif board[x][y] == 1:      # rigid wall
                    board[x][y] = 4

        # Propagate each bomb's blast cross into the danger map, keeping the
        # minimum remaining bomb life per cell.
        for x in range(11):
            for y in range(11):
                if bomb_life[x][y] > 0:
                    strength = int(bomb_blast_strength[x][y] + 0.5)
                    for tx in range(max(0, x - strength + 1),
                                    min(11, x + strength)):
                        danger[tx][y] = min(danger[tx][y], bomb_life[x][y])
                    for ty in range(max(0, y - strength + 1),
                                    min(11, y + strength)):
                        danger[x][ty] = min(danger[x][ty], bomb_life[x][y])
        obs['danger'] = danger
        return obs

    def render(self):
        self.pomme.render()
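# Illustrative smoke test (the helper name is ours): one random step through
# the wrapper; the preprocessed observation includes the derived "danger" plane.
def _smoke_test():
    env = PomFFA()
    obs = env.reset()
    assert "danger" in obs
    obs, reward, done, _ = env.step(env.action_space.sample())
    return reward, done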
class MultiAgent(MultiAgentEnv):
    def __init__(self):
        super(MultiAgent, self).__init__()
        self.phase = 0
        self.setup()

    def setup(self):
        agents = []
        if self.phase == 0:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 20
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(
                2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 1:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(
                2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 2:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, NoDoAgent(config["agent"](0, config["game_type"])))
            agents.insert(
                2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 3:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, NoDoAgent(config["agent"](0, config["game_type"])))
            agents.insert(
                2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 4:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 0
            config["env_kwargs"]["num_items"] = 10
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(
                2, SimpleAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        for agent_id in self.agents_index:
            agents.insert(
                agent_id,
                BaseLineAgent(config["agent"](agent_id, config["game_type"])))

        self.env.set_agents(agents)
        self.env.set_init_game_state(None)
        self.observation_space = spaces.Dict({
            "boards": spaces.Box(low=-1, high=20, shape=(3, 11, 11)),
            "states": spaces.Box(low=-1, high=20, shape=(9, )),
        })
        # Leftover, unused:
        # spaces.Box(low=-1.0, high=20.0, shape=(372, ), dtype=np.float32)
        self.action_space = self.env.action_space

    def set_phase(self, phase):
        print("learn phase " + str(phase))
        self.phase = phase
        self.setup()
        self.reset()

    def step(self, actions):
        obs = self.env.get_observations()
        all_actions = self.env.act(obs)
        for index in self.agents_index:
            try:
                action = actions[index]
            except KeyError:
                action = 0
            all_actions[index] = action
        step_obs = self.env.step(all_actions)
        obs, rew, done, info = {}, {}, {}, {}
        for i in actions.keys():
            obs[i], rew[i], done[i], info[i] = [
                featurize(step_obs[0][i]),
                step_obs[1][i],
                step_obs[1][i] == -1 or step_obs[2],
                step_obs[3],
            ]
        done["__all__"] = step_obs[2]
        return obs, rew, done, info

    def reset(self):
        obs = self.env.reset()
        return {i: featurize(obs[i]) for i in self.agents_index}
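# Illustrative curriculum usage for the phased MultiAgent env (the helper name
# is ours, not from the original): set_phase() rebuilds the underlying Pomme
# env with the requested opponent layout and resets it.
def _advance_curriculum(env, phase):
    env.set_phase(phase)   # prints "learn phase <n>", re-runs setup() and reset()
    return env.reset()     # featurized observations, keyed by agent index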
class MultiAgend(MultiAgentEnv):
    def __init__(self):
        super(MultiAgend, self).__init__()
        self.phase = 0
        self.next_phase = 0
        self.steps = 0
        self.last_featurize_obs = None
        self.setup()

    def featurize(self, obs):
        # Collect the Item values of the configured enemies.
        enemies = []
        for agent_id in self.enemies_agents_index:
            if agent_id == 0:
                enemies.append(Item.Agent0)
            if agent_id == 1:
                enemies.append(Item.Agent1)
            if agent_id == 2:
                enemies.append(Item.Agent2)
            if agent_id == 3:
                enemies.append(Item.Agent3)

        # Remove agents that the obs lists as enemies but which are not actual
        # opponents from the board.
        for enemy in obs["enemies"]:
            if enemy not in enemies:
                obs["board"] = ma.masked_equal(
                    obs["board"], enemy.value).filled(fill_value=0)

        board = np.copy(obs["board"])
        board[obs["position"][0], obs["position"][1]] = 0.0

        # Extract one binary plane per feature, zeroing the matched cells on
        # the working board as we go.
        enemy_pos = np.full((11, 11), 0)
        for enemy in obs["enemies"]:
            enemy_pos = enemy_pos | ma.masked_not_equal(
                board, enemy.value).filled(fill_value=0)
            board = ma.masked_equal(board, enemy.value).filled(fill_value=0)

        wood = ma.masked_not_equal(board, 2).filled(fill_value=0)
        wood = (wood > 0).astype(np.float32)
        board = ma.masked_equal(board, 2).filled(fill_value=0)

        stone = ma.masked_not_equal(board, 1).filled(fill_value=0)
        stone = (stone > 0).astype(np.float32)
        board = ma.masked_equal(board, 1).filled(fill_value=0)

        enemy_pos = (enemy_pos > 0).astype(np.float32)

        board = ma.masked_equal(board,
                                obs["teammate"].value).filled(fill_value=0)

        flames = ma.masked_not_equal(board, 4).filled(fill_value=0)
        flames = (flames > 0).astype(np.float32)
        board = ma.masked_equal(board, 4).filled(fill_value=0)
        board = ma.masked_equal(board, 3).filled(fill_value=0)

        teammate_pos = ma.masked_not_equal(
            board, obs["teammate"].value).filled(fill_value=0)
        teammate_pos = (teammate_pos > 0).astype(np.float32)
        board = ma.masked_equal(board,
                                obs["teammate"].value).filled(fill_value=0)

        items = board.astype(np.float32)

        pos = np.full((11, 11), 0)
        pos[obs["position"][0], obs["position"][1]] = 1.0
        pos = pos.astype(np.float32)

        bomb_life = obs["bomb_life"].astype(np.float32)
        bomb_blast_strength = obs["bomb_blast_strength"].astype(np.float32)

        ammo = utility.make_np_float([obs["ammo"]])
        blast_strength = utility.make_np_float([obs["blast_strength"]])
        can_kick = utility.make_np_float([obs["can_kick"]])
        # Fraction of the episode remaining.
        game_end = utility.make_np_float([
            (self.max_steps - self.steps) / self.max_steps
        ])

        actual_featurize_obs = {
            'boards':
            np.stack([
                enemy_pos, pos, wood, stone, items, flames, teammate_pos,
                bomb_life, bomb_blast_strength
            ], axis=0),
            'states':
            np.concatenate([ammo, blast_strength, can_kick, game_end]),
        }

        # Frame stacking: concatenate the previous frame with the current one;
        # on the first step the current frame is duplicated.
        if self.last_featurize_obs is None:
            featurize_obs = {
                'boards':
                np.concatenate([
                    actual_featurize_obs['boards'],
                    actual_featurize_obs['boards']
                ], axis=0),
                'states':
                np.concatenate([
                    actual_featurize_obs['states'],
                    actual_featurize_obs['states']
                ]),
            }
        else:
            featurize_obs = {
                'boards':
                np.concatenate([
                    self.last_featurize_obs['boards'],
                    actual_featurize_obs['boards']
                ], axis=0),
                'states':
                np.concatenate([
                    self.last_featurize_obs['states'],
                    actual_featurize_obs['states']
                ]),
            }

        self.last_featurize_obs = actual_featurize_obs
        return featurize_obs

    def setup(self):
        agents = []
        if self.phase == 0:
            # Randomly assign which of the two slots is the learner.
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index,
                                              config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            # Hand-crafted initial state: empty board, two agents at random
            # positions.
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents':
                '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            board = np.full((11, 11), 0)
            init_state['board'] = json.dumps(board.tolist())
            agents_json = json.loads(init_state['agents'])
            random_pos = np.random.choice(board.shape[0], (2, 2),
                                          replace=False)
            agents_json[0]["position"] = random_pos[0].tolist()
            agents_json[1]["position"] = random_pos[1].tolist()
            init_state['agents'] = json.dumps(agents_json)
            self.env._init_game_state = init_state
            self.env.reset()

        if self.phase == 1:
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index,
                                              config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents':
                '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            # Wall of wood across the middle row; one agent on each side.
            board = np.full((11, 11), 0)
            board[5, :] = (np.ones(11) * 2)
            agents_json = json.loads(init_state['agents'])
            agents_json[0]["position"] = [
                random.randint(0, 4),
                random.randint(0, 10)
            ]
            agents_json[1]["position"] = [
                random.randint(6, 10),
                random.randint(0, 10)
            ]
            init_state['agents'] = json.dumps(agents_json)
            init_state['board'] = json.dumps(board.tolist())
            self.env._init_game_state = init_state
            self.env.reset()

        self.observation_space = spaces.Dict({
            # Channels-first, as produced by featurize: 2 stacked frames of
            # 9 board planes each.
            'boards':
            spaces.Box(low=-1, high=25, shape=(18, 11, 11), dtype=np.float32),
            'states':
            spaces.Box(low=-1, high=25, shape=(8, ), dtype=np.float32)
        })
        self.action_space = self.env.action_space

    def set_phase(self, phase):
        print("learn phase " + str(phase))
        self.next_phase = phase

    def close(self):
        self.env.close()

    def step(self, actions):
        self.steps = self.steps + 1
        obs = self.env.get_observations()
        all_actions = self.env.act(obs)
        assert (len(all_actions) ==
                len(self.agents_index) + len(self.enemies_agents_index))
        for index in self.agents_index:
            try:
                action = actions[index]
            except KeyError:
                action = 0
            assert all_actions[index] is None
            all_actions[index] = action
        step_obs = self.env.step(all_actions)
        obs, rew, done, info = {}, {}, {}, {}
        for i in actions.keys():
            obs[i], rew[i], done[i], info[i] = [
                self.featurize(step_obs[0][i]),
                step_obs[1][i],
                step_obs[1][i] == -1 or step_obs[2],
                step_obs[3]
            ]
        done["__all__"] = step_obs[2]
        return obs, rew, done, info

    def reset(self):
        self.steps = 0
        self.phase = self.next_phase
        self.setup()
        obs = self.env.get_observations()
        return {i: self.featurize(obs[i]) for i in self.agents_index}
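# Illustrative shape check (the helper name is ours): the frame-stacked
# features from MultiAgend.featurize are two frames of 9 board planes plus
# two frames of 4 scalar features.
def _check_featurized_shapes():
    env = MultiAgend()
    obs = env.reset()
    first = obs[env.agents_index[0]]
    assert first['boards'].shape == (18, 11, 11)
    assert first['states'].shape == (8,)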
class PomFFA(gym.Env):
    def __init__(self, env_config=None):
        self.agent_list = [
            HoldAgent(), agents.SimpleAgent(), HoldAgent(), HoldAgent()
        ]
        # self.agent_list = [agents.SimpleAgent(), agents.SimpleAgent(),
        #                    agents.SimpleAgent(), agents.RandomAgent()]
        self.all_obs = None
        self.all_action = None
        self.cur_obs = None
        self.alive_agents = [10, 11, 12, 13]
        self.player_agent_id = 10
        self.total_reward = 0

        pomme_config = pommerman.configs.ffa_competition_env()
        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs.copy()
        obs = self.preprocess(obs)
        self.total_reward = 0
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won: +0.5 for the winner, -0.5 for everyone else.
            if agent_id in obs['alive']:
                return 0.5
            else:
                return -0.5
        if obs["step_count"] >= 500:
            # Game is over on time. Everyone gets -0.5.
            return -0.5
        # Game still running: shaped reward for the living, -0.5 for the dead.
        if agent_id not in obs['alive']:
            return -0.5

        x, y = obs["position"]
        # blast = obs["bomb_blast_strength"]
        px = [0, 1, 0, -1]
        py = [1, 0, -1, 0]
        sum_reward = 0
        if action == 5:
            # Bomb placed: bonus for adjacent cells with board value 1 and a
            # larger bonus for adjacent agents (values > 10).
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if tx < 0 or tx > 10 or ty < 0 or ty > 10:
                    continue
                if obs["board"][tx][ty] == 1:
                    sum_reward += 1
                elif obs["board"][tx][ty] > 10:
                    sum_reward += 4
        sum_reward = sum_reward * 1.0 / 200.0
        new_total_reward = self.total_reward + sum_reward
        # Cap the cumulative shaped reward at +/-0.5.
        if new_total_reward > 0.5 or new_total_reward < -0.5:
            sum_reward = 0
        else:
            self.total_reward = new_total_reward
        return sum_reward

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)
        del self.all_obs
        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        del self.cur_obs
        self.cur_obs = obs.copy()
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if self.player_agent_id not in self.alive_agents \
                or self.cur_obs["step_count"] >= 500:
            done = True
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
        observations for agents
        board: n^2
        bomb blast strength: n^2
        bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        board = spaces.Box(low=0, high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def preprocess(obs):
        # Drop entries that are not part of the observation space.
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
class PomFFA(gym.Env):

    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    ammo = 1
    blast_strength = 2
    state = {}

    def __init__(self, env_config={}):
        pomme_config = pommerman.configs.ffa_competition_env()
        self.reward = Reward(env_config.get("reward"))
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space
        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)
        self.init_state()

    def init_state(self):
        self.state['agent_id'] = self.agent_id
        self.state['alive'] = self.alive_agents
        self.state['visited'] = set()
        self.state['blast_strength'] = self.blast_strength
        self.state['ammo'] = self.ammo
        self.state["bombs"] = {}

    def reset(self):
        all_obs = self.pomme.reset()
        obs = self.get_for_training_agent(all_obs)
        self.init_state()
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        obs = self.build_obs(obs, self.state)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.state['all_obs'])
        actions = self.set_for_training_agent(actions, action)
        all_obs, _, _, _ = self.pomme.step(actions)
        obs = self.get_for_training_agent(all_obs)
        info = {'board': obs['board'], 'blast_strength': obs['blast_strength']}
        done = self.get_done(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        self.state['blast_strength'] = obs['blast_strength']
        self.state['ammo'] = obs['ammo']
        obs = self.build_obs(obs, self.state)
        return obs, reward, done, info

    def get_for_training_agent(self, inputs):
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    def build_obs(self, obs, state):
        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        flame_life = obs['flame_life']
        agent_id = state['agent_id']
        ammo = state['ammo']

        # One binary plane per board feature.
        passage = np.zeros_like(board)
        wall = np.zeros_like(board)
        wood = np.zeros_like(board)
        bomb = np.zeros_like(board)
        bonus = np.zeros_like(board)
        me = np.zeros_like(board)
        enemy = np.zeros_like(board)
        for y in range(board.shape[0]):
            for x in range(board.shape[1]):
                v = board[y][x]
                if v == 0:
                    passage[y][x] = 1
                elif v == 1:
                    wall[y][x] = 1
                elif v == 2:
                    wood[y][x] = 1
                elif v == 3:
                    # Mark the bomb's whole blast cross.
                    bomb = create_cross(bomb, (y, x),
                                        bomb_blast_strength[y][x])
                elif v == 4:
                    pass
                elif v == 6 or v == 7:
                    bonus[y][x] = 1
                elif v >= 10:
                    if v == agent_id:
                        me[y][x] = 1
                    else:
                        enemy[y][x] = 1
                    # An agent can be standing on a bomb.
                    if bomb_blast_strength[y][x] > 0:
                        bomb = create_cross(bomb, (y, x),
                                            bomb_blast_strength[y][x])

        # Normalize the scalar planes.
        ammo = ammo * np.ones_like(board) / 12
        bomb_life /= 9
        flame_life /= 3
        board = np.transpose(
            np.stack([
                passage, wall, wood, bomb, bonus, me, enemy, bomb_life,
                flame_life, ammo
            ]), [1, 2, 0])
        return board

    @staticmethod
    def init_observation_space(env_config):
        """
        observations for agents
        board: n^2
        bomb blast strength: n^2
        bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        # passage, wall, wood, bomb, bonus, me, enemy, bomb_life, flame_life, ammo
        board = spaces.Box(low=0, high=1, shape=(board_size, board_size, 10))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        ammo = spaces.Box(low=0, high=num_items,
                          shape=(board_size, board_size))
        # return spaces.Dict({"board": board, "bomb_life": bomb_life,
        #                     "flame_life": flame_life, "ammo": ammo})
        return board

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    def render(self):
        self.pomme.render()
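# Illustrative smoke test (the helper name is ours): build_obs returns a single
# channels-last tensor of 10 planes (passage, wall, wood, bomb, bonus, me,
# enemy, bomb_life, flame_life, ammo).
def _check_obs_shape():
    env = PomFFA()
    obs = env.reset()
    assert obs.shape == (11, 11, 10)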
class Pomme_v0(MultiAgentEnv):
    '''
    A wrapped Pommerman v0 environment for usage with Ray RLlib.

    The v0 environment is the base environment used in the NIPS'18
    competition. Contrary to v1 it doesn't collapse walls and also doesn't
    allow for radio communication between agents (as does v2).

    Agents are identified by (string) agent IDs: `AGENT_IDS`
    (Note that these "agents" are not to be confused with RLlib agents.)
    '''

    def __init__(self, config=pommerman_cfg.team_competition_env()):
        '''
        Initializes the Pommerman environment and adds dummy agents as
        expected by `Pomme`.

        Args:
            config (dict): A config defining the game mode. Options include
                FFA mode, team (2v2) and team radio (2v2). See pommerman's
                config.py and docs for more details.
        '''
        self.pomme = Pomme(**config['env_kwargs'])
        # Placeholder: observations are returned as plain dicts.
        self.observation_space = dict
        self.action_space = self.pomme.action_space
        self.agent_names = AGENT_IDS

        agent_list = []
        for agent_id in range(4):
            agent_list.append(
                agents.BaseAgent(config["agent"](agent_id,
                                                 config["game_type"])))
        self.pomme.set_agents(agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        """
        Resets the env and returns observations from ready agents.

        Returns:
            obs (dict): New observations for each ready agent.
        """
        obs_list = self.pomme.reset()
        # return {key: featurize(val) for key, val in to_dict(obs_list).items()}
        return {key: val for key, val in to_dict(obs_list).items()}

    def step(self, action_dict):
        """
        Returns observations from ready agents.

        The returns are dicts mapping from agent_id strings to values. The
        number of agents in the env can vary over time.

        Returns:
            obs (dict): New observations for each ready agent.
            rewards (dict): Reward values for each ready agent. If the
                episode has just started, the value will be zero.
            dones (dict): Done values for each ready agent. The key
                "__all__" is used to indicate the end of the game.
            infos (dict): Info values for each ready agent.
        """
        # Default actions, since the Pommerman env expects actions even for
        # dead agents.
        actions = {'agent_0': 0, 'agent_1': 0, 'agent_2': 0, 'agent_3': 0}
        # Update the defaults with the actions returned from the policies.
        actions.update(action_dict)

        # Perform the env step (expects a list).
        obs, rewards, done, info = self.pomme.step(list(actions.values()))

        # Build the 'dones' dictionary; key "__all__" indicates env
        # termination.
        dones = {'__all__': done}
        # Fetch the done state of every agent.
        done_agents = to_dict(
            [not agent.is_alive for agent in self.pomme._agents])
        # Filter the done dictionary to only return agents which are still
        # alive -> apparently this is how rllib determines when agents "die".
        dones.update({key: val for key, val in done_agents.items() if not val})

        # Turn the info dict into a dictionary with agent IDs as keys.
        infos = {
            AGENT_IDS[i]: {info_k: info_v for info_k, info_v in info.items()}
            for i in range(NUM_PLAYERS)
        }
        return to_dict(obs), to_dict(rewards), dones, infos
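# Hedged sketch: registering the wrapper with Ray Tune so RLlib trainers can
# construct it by name ("pomme_v0" is an illustrative name, not from the
# original; assumes ray is installed).
from ray.tune.registry import register_env

register_env("pomme_v0", lambda env_config: Pomme_v0())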
class PomFFA(gym.Env):

    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent()
    ]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")
        print(pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        obs = self.preprocess(obs)
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, everyone else -1.
            if agent_id in obs['alive']:
                return 1
            else:
                return -1
        if obs["step_count"] >= 500:
            # Game is over on time. Everyone gets -1.
            return -1
        # Game still running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -1
        # Disabled: penalty for standing inside a potential blast line.
        # x, y = obs["position"]
        # blast = obs["bomb_blast_strength"]
        #
        # for w in range(11):
        #     if blast[x][w] > int(math.fabs(w - y)):
        #         return -10
        #
        #     if blast[w][y] > int(math.fabs(w - x)):
        #         return -10
        return 0

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
        observations for agents
        board: n^2
        bomb blast strength: n^2
        bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))

        board = spaces.Box(low=0, high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))

        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
position, "ammo": ammo, "blast_strength": blast_strength }) @staticmethod def preproess(obs): del obs["game_type"] del obs["game_env"] del obs["can_kick"] del obs["teammate"] del obs["enemies"] del obs["step_count"] del obs['alive'] del obs['bomb_moving_direction'] obs['position'] = np.array(obs['position']) obs['ammo'] = np.array([obs['ammo']]) obs['blast_strength'] = np.array([obs['blast_strength']]) return obs def render(self): self.pomme.render()
class PomFFA(gym.Env):

    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    all_obs = None
    all_action = None
    pre_obs = None
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    state = {}

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
            self.reward = Reward(env_config.get("reward"))
        else:
            self.reward = Reward()

        print("Pommerman Config:", pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        state = {
            "prev_obs": None,
            "visited": set(),
            "agent_id": 10,
            "alive": [10, 11, 12, 13],
            "strength": 2,
            "ammo": 1,
            "bombs": {},
        }
        state['prev_obs'] = copy.deepcopy(obs)
        state['position'] = obs['position']
        self.state = state
        obs = self.preprocess(obs)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        actions = self.set_for_training_agent(actions, action)
        obs, rewards, _, _ = self.pomme.step(actions)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)
        done = self.get_done(obs)
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['position'] = obs['position']
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    @staticmethod
    def init_observation_space(env_config):
        """
        observations for agents
        board: n^2
        bomb blast strength: n^2
        bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']

        board = spaces.Box(low=0, high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))

        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    @staticmethod
    def preprocess(obs):
        # Drop entries that are not part of the observation space.
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
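# Illustrative random rollout through the wrapper above (the helper name is
# ours): mirrors the standard gym loop that the class implements.
def _random_rollout():
    env = PomFFA()
    obs = env.reset()
    done, total = False, 0.0
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
        total += reward
    return total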