Пример #1
def ffa_evaluate(env: "Pomme", episodes, verbose, visualize, stop=False):
    """
    Evaluates the given pommerman environment (already includes the agents).

    :param env: The pommerman environment to run; it must provide ``reset``,
                ``act``, ``step`` and (when visualizing) ``render``
    :param episodes: The number of episodes
    :param verbose: Whether to print verbose status information
    :param visualize: Whether to visualize the execution
    :param stop: Whether to wait for input after each step
    :return: The results of the evaluation of shape (episodes, 5) where the first column [:, 0] contains the result
             of the match (tie, win, incomplete) and the remaining columns contain the individual (final) rewards.
    """
    # first element: result, additional elements: rewards
    steps = np.empty(episodes)
    results = np.empty((episodes, 1 + 4))

    start = time.time()

    # Run the episodes just like OpenAI Gym
    for i_episode in range(episodes):
        state = env.reset()
        done = False
        reward = []
        info = {}
        step = 0
        while not done:
            if visualize:
                # NOTE(review): body restored from the parameter description.
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            step += 1

            if stop:
                # NOTE(review): body restored from the parameter description
                # ("wait for input after each step").
                input()

        steps[i_episode] = step

        result = info['result']
        # save the result
        results[i_episode, 0] = result.value
        results[i_episode, 1:] = reward

        if verbose:
            delta = time.time() - start
            print('\r{:.2f} sec > Episode {} finished with {} ({})'.format(
                delta, i_episode, result, reward))

            # Intermediate statistics every 10 episodes (the final summary
            # below covers the last episode).
            if i_episode % 10 == 9 and i_episode != episodes - 1:
                ffa_print_stats(results, steps, i_episode + 1)

    if verbose:
        delta = time.time() - start
        print("Total time: {:.2f} sec".format(delta))
        ffa_print_stats(results, steps, episodes)

    return results
Пример #2
def main():
    """Play one FFA episode with four SimpleAgents and pickle the outcome.

    If the episode has a winner (some agent's final reward is 1), a dict with
    the keys ``'demo'`` and ``'winner'`` is written to ``demonstration.p``.
    """
    config = ffa_v1_env()
    env = Pomme(**config["env_kwargs"])

    # Add four SimpleAgents
    agents = {}
    for agent_id in range(4):
        agents[agent_id] = SimpleAgent(
            config["agent"](agent_id, config["game_type"]))

    # NOTE(review): the registration calls were missing from this snippet;
    # restored following the standard Pommerman setup sequence — confirm.
    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)

    # NOTE(review): nothing in the visible code ever appends to `demo`, so the
    # saved demonstration is empty — confirm what should be recorded.
    demo = []
    winner = None

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            actions = env.act(state)
            state, reward, done, info = env.step(actions)

        # A final reward of 1 marks the winner; otherwise treat as a tie.
        if 1 in reward:
            winner = reward.index(1)
        else:
            winner = None

        print('Episode {} finished'.format(i_episode))

    # If game not tied, save demonstration
    if winner is not None:
        demonstration = {'demo': demo, 'winner': winner}
        with open("demonstration.p", "wb") as demo_file:
            pickle.dump(demonstration, demo_file)
Пример #3
# Three static opponents occupy slots 0-2.
agents = {}
for agent_id in range(3):
    agents[agent_id] = StaticAgent(config["agent"](agent_id, config["game_type"]))

# Add human agent (slot 3, controlled with the arrow keys)
agent_id += 1
agents[3] = PlayerAgent(config["agent"](agent_id, config["game_type"]), "arrows")

# NOTE(review): the snippet built `agents` but never handed it to `env`;
# registration restored following the standard Pommerman setup — confirm.
env.set_agents(list(agents.values()))
env.set_init_game_state(None)

# Reset the environment
obs = env.reset()

# Run the agents until we're done
done = False
while not done:
    # The original (German) note here said "I don't need this";
    # NOTE(review): `actions` is consumed by env.step below, so the call stays.
    actions = env.act(obs)
    obs, reward, done, info = env.step(actions)

# Print the result
# NOTE(review): the statement after this comment was missing; printing the
# final `info` dict is the presumed intent — confirm.
print(info)


class PomFFA(gym.Env):
    """Single-agent Gym view of a four-player Pommerman FFA match.

    ``self.pomme`` computes actions for all agents; the action of the agent
    with id ``player_agent_id`` is overwritten by the action given to
    :meth:`step`. Only that agent's preprocessed observation and a shaped
    reward are returned to the caller.
    """

    # Class-level defaults. `agent_list` is shared by all instances.
    agent_list = [HoldAgent(), HoldAgent(), HoldAgent(), HoldAgent()]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):
        """Build the wrapped Pomme env.

        :param env_config: optional dict; keys that also exist in the
            competition config's ``env_kwargs`` override them, and
            ``is_training`` (default True) controls whether agents are
            initialised.
        """
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))

        if not env_config or env_config.get("is_training", True):
            # initialize env twice could raise error here.
            # NOTE(review): the body of this branch was missing; calling
            # init() is the presumed original — confirm.
            self.init(pomme_config)

    def init(self, pomm_config):
        """Initialise every agent in ``agent_list`` and attach them to the env."""
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        # NOTE(review): no visible code registered the agents with the env;
        # added following the standard Pommerman setup sequence — confirm.
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        """Start a new episode and return the training agent's observation."""
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        # Refresh the alive list so step() does not reuse the previous
        # episode's list (which may no longer contain the player).
        self.alive_agents = obs['alive']
        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))
        return self.preproess(obs)

    def get_reward(self, obs, action, agent_id):
        """Return the shaped reward for ``agent_id`` after taking ``action``.

        Terminal cases return fixed values; otherwise small bonuses/penalties
        are accumulated, scaled by 1/100 and clipped through ``total_reward``.
        """
        if len(obs["alive"]) == 1:
            # Exactly one survivor: the winner is topped up to a total of 1.0,
            # everyone else gets -0.5.
            if agent_id in obs['alive']:
                return 1.0 - self.total_reward
            else:
                return -0.5

        if obs["step_count"] >= 500:
            # Game is over from time.
            return -0.5

        # Game running: the agent is dead.
        if agent_id not in obs['alive']:
            return -0.5

        x, y = obs["position"]
        board = obs["board"]
        size = len(board)

        px = [1, -1, 0, 0]
        py = [0, 0, -1, 1]

        sum_reward = 0.0

        # NOTE(review): this term is negative when the alive count drops,
        # i.e. it penalises eliminations — confirm the intended sign.
        sum_reward += 20 * (len(obs["alive"]) - self.prev_alive)
        self.prev_alive = len(obs["alive"])

        if action == 0:
            # Standing still is mildly discouraged.
            sum_reward -= 0.1
        elif action == 5:
            # Bomb action: reward by what sits in the four neighbouring cells.
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if not (0 <= tx < size and 0 <= ty < size):
                    continue
                if board[tx][ty] == 1:
                    sum_reward += 2
                elif board[tx][ty] > 10:
                    sum_reward += 4
        else:
            assert (1 <= action <= 4), str(action)
            dx = x + px[action - 1]
            dy = y + py[action - 1]
            if 0 <= dx < size and 0 <= dy < size and board[dx][dy] == 0:
                # Favour cells not yet marked in `visited`.
                if self.visited[dx][dy] > 0:
                    sum_reward -= 0.1
                else:
                    sum_reward += 0.3
                    self.visited[dx][dy] = 1

        sum_reward = sum_reward * 1.0 / 100.0
        new_total_reward = self.total_reward + sum_reward
        # Keep the accumulated shaped reward inside [-0.5, 0.8].
        if new_total_reward > 0.8 or new_total_reward < -0.5:
            sum_reward = 0.0
        else:
            self.total_reward = new_total_reward

        return sum_reward

    def step(self, action):
        """Advance the game one tick using ``action`` for the training agent.

        :return: ``(obs, reward, done, {})`` for the training agent.
        """
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            # The player is not in the alive list: submit action 0.
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)

        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id
                not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preproess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        """Return a shallow copy of the training agent's entry in ``inputs``."""
        order = self.player_agent_id - 10
        return inputs[order].copy()

    def set_for_training_agent(self, inputs, value):
        """Store ``value`` in the training agent's slot of ``inputs``."""
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))

        # NOTE(review): the `high=` arguments of `board` and
        # `bomb_blast_strength` were missing; values below are assumed —
        # confirm.
        board = spaces.Box(low=0,
                           high=len(constants.Item),
                           shape=(board_size, board_size))
        danger = spaces.Box(low=0, high=20, shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0,
                                         high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=10, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength,
            "danger": danger,
        })

    @staticmethod
    def preproess(obs):
        """Turn a raw agent observation into the dict fed to the policy.

        Drops bookkeeping keys, wraps scalars in arrays, remaps board codes
        and adds a ``danger`` map (10 by default, 0 where the raw board value
        is 4, otherwise the smallest ``bomb_life`` of any bomb whose cross
        covers the cell). Works on copies so the caller's dict and board
        array are left untouched.
        """
        obs = dict(obs)
        for key in ("game_type", "game_env", "can_kick", "teammate",
                    "enemies", "step_count", "alive",
                    "bomb_moving_direction"):
            del obs[key]
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])

        raw = np.array(obs['board'])
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        rows, cols = raw.shape

        # Every mask is computed from the raw codes, so the remaps cannot
        # cascade into one another.
        board = raw.copy()
        board[(raw == 3) | (raw == 4)] = 0
        board[raw == 10] = 1
        board[raw > 10] = 5
        board[(raw >= 6) & (raw <= 8)] = 3
        board[raw == 1] = 4

        danger = np.full((rows, cols), 10, dtype=int)
        danger[raw == 4] = 0

        for x, y in np.argwhere(np.asarray(bomb_life) > 0):
            life = bomb_life[x][y]
            strength = int(bomb_blast_strength[x][y] + 0.5)
            x_lo, x_hi = max(0, x - strength + 1), min(rows, x + strength)
            y_lo, y_hi = max(0, y - strength + 1), min(cols, y + strength)
            danger[x_lo:x_hi, y] = np.minimum(danger[x_lo:x_hi, y], life)
            danger[x, y_lo:y_hi] = np.minimum(danger[x, y_lo:y_hi], life)

        obs['board'] = board
        obs['danger'] = danger

        return obs

    def render(self):
        """Render the wrapped environment."""
        # NOTE(review): the body was missing; delegating to the wrapped env
        # is the presumed original — confirm.
        self.pomme.render()
Пример #5
class PomFFA(gym.Env):
    """Single-agent Gym view of a four-player Pommerman FFA match.

    The action of the agent with id ``player_agent_id`` is replaced by the
    action passed to :meth:`step`; the other agents in ``agent_list`` act on
    their own through ``self.pomme.act``.
    """

    def __init__(self, env_config=None):
        """Build the wrapped Pomme env.

        :param env_config: optional dict; keys that also exist in the
            competition config's ``env_kwargs`` override them, and
            ``is_training`` (default True) controls whether agents are
            initialised.
        """
        self.agent_list = [HoldAgent(), agents.SimpleAgent(), HoldAgent(), HoldAgent()]
        self.all_obs = None
        self.all_action = None
        self.cur_obs = None
        self.alive_agents = [10, 11, 12, 13]
        self.player_agent_id = 10
        self.total_reward = 0

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # initialize env twice could raise error here.
            # NOTE(review): the body of this branch was missing; calling
            # init() is the presumed original — confirm.
            self.init(pomme_config)

    def init(self, pomm_config):
        """Initialise every agent in ``agent_list`` and attach them to the env."""
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        # NOTE(review): no visible code registered the agents with the env;
        # added following the standard Pommerman setup sequence — confirm.
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        """Start a new episode and return the training agent's observation."""
        obs = self.pomme.reset()
        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs.copy()
        # Refresh the alive list so step() does not reuse the previous
        # episode's list (which may no longer contain the player).
        self.alive_agents = obs['alive']
        self.total_reward = 0
        return self.preproess(obs)

    def get_reward(self, obs, action, agent_id):
        """Return the shaped reward for ``agent_id`` after taking ``action``."""
        if len(obs["alive"]) == 1:
            # Exactly one survivor: +0.5 for the winner, -0.5 otherwise.
            if agent_id in obs['alive']:
                return 0.5
            else:
                return -0.5

        if obs["step_count"] >= 500:
            # Game is over from time.
            return -0.5

        # Game running: the agent is dead.
        if agent_id not in obs['alive']:
            return -0.5

        x, y = obs["position"]
        board = obs["board"]

        px = [0, 1, 0, -1]
        py = [1, 0, -1, 0]

        sum_reward = 0
        if action == 5:
            # Bomb action: reward by what sits in the four neighbouring cells.
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if tx < 0 or tx > 10 or ty < 0 or ty > 10:
                    continue
                if board[tx][ty] == 1:
                    sum_reward += 1
                elif board[tx][ty] > 10:
                    sum_reward += 4

        sum_reward = sum_reward * 1.0 / 200.0
        new_total_reward = self.total_reward + sum_reward
        # Keep the accumulated shaped reward inside [-0.5, 0.5].
        if new_total_reward > 0.5 or new_total_reward < -0.5:
            sum_reward = 0
        else:
            self.total_reward = new_total_reward

        return sum_reward

    def step(self, action):
        """Advance the game one tick using ``action`` for the training agent.

        :return: ``(obs, reward, done, {})`` for the training agent.
        """
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            # The player is not in the alive list: submit action 0.
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)

        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs.copy()
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']

        if self.player_agent_id not in self.alive_agents or self.cur_obs["step_count"] >= 500:
            done = True
        obs = self.preproess(obs)

        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        """Return the training agent's entry of ``inputs``."""
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        """Store ``value`` in the training agent's slot of ``inputs``."""
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']

        board = spaces.Box(low=0, high=len(constants.Item), shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items, shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))
        return spaces.Dict({"board": board, "bomb_blast_strength": bomb_blast_strength, "bomb_life": bomb_life,
                            "flame_life": flame_life,
                            "position": position, "ammo": ammo, "blast_strength": blast_strength})

    @staticmethod
    def preproess(obs):
        """Strip bookkeeping keys and wrap scalar fields in arrays.

        Operates on a shallow copy so the dict still referenced from
        ``all_obs`` (and handed back to ``pomme.act``) keeps all its keys.
        """
        obs = dict(obs)
        for key in ("game_type", "game_env", "can_kick", "teammate",
                    "enemies", "step_count", "alive",
                    "bomb_moving_direction"):
            del obs[key]
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        """Render the wrapped environment."""
        # NOTE(review): the body was missing; delegating to the wrapped env
        # is the presumed original — confirm.
        self.pomme.render()
Пример #6
class PomFFA(gym.Env):
    """Single-agent Gym view of a Pommerman FFA match with image-like observations.

    Observations are ``(rows, cols, 10)`` arrays built by :meth:`build_obs`;
    rewards come from the ``Reward`` helper, which also threads a ``state``
    dict through the episode.
    """

    # NOTE(review): the original contents of this list were lost; four
    # SimpleAgents are a placeholder — confirm the intended opponents.
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
    ]
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    ammo = 1
    blast_strength = 2
    state = {}

    def __init__(self, env_config=None):
        """Build the wrapped Pomme env.

        :param env_config: optional dict; ``reward`` is forwarded to
            ``Reward`` and ``is_training`` (default True) controls whether
            agents are initialised.
        """
        # Avoid a shared mutable default argument.
        env_config = env_config or {}
        pomme_config = pommerman.configs.ffa_competition_env()
        self.reward = Reward(env_config.get("reward"))
        # Per-instance state instead of the dict shared on the class.
        self.state = {}

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # initialize env twice could raise error here.
            # NOTE(review): the body of this branch was missing; calling
            # init() is the presumed original — confirm.
            self.init(pomme_config)

    def init(self, pomm_config):
        """Initialise every agent in ``agent_list`` and attach them to the env."""
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        # NOTE(review): no visible code registered the agents with the env;
        # added following the standard Pommerman setup sequence — confirm.
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def init_state(self):
        """Reset the per-episode bookkeeping entries of ``self.state``."""
        self.state['agent_id'] = self.agent_id
        self.state['alive'] = self.alive_agents
        self.state['visited'] = set()
        self.state['blast_strength'] = self.blast_strength
        self.state['ammo'] = self.ammo
        self.state["bombs"] = {}

    def reset(self):
        """Start a new episode and return the training agent's observation."""
        all_obs = self.pomme.reset()
        obs = self.get_for_training_agent(all_obs)

        # build_obs reads state['agent_id'] and state['ammo'], which only
        # init_state() sets; nothing visible called it.
        # NOTE(review): confirm reset() is the intended call site.
        self.init_state()
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']

        obs = self.build_obs(obs, self.state)
        return obs

    def step(self, action):
        """Advance the game one tick using ``action`` for the training agent.

        :return: ``(obs, reward, done, info)`` where ``info`` carries the raw
            board and the agent's blast strength.
        """
        actions = self.pomme.act(self.state['all_obs'])
        actions = self.set_for_training_agent(actions, action)

        all_obs, _, _, _ = self.pomme.step(actions)
        obs = self.get_for_training_agent(all_obs)
        info = {'board': obs['board'], 'blast_strength': obs['blast_strength']}
        done = self.get_done(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)

        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        self.state['blast_strength'] = obs['blast_strength']
        self.state['ammo'] = obs['ammo']

        obs = self.build_obs(obs, self.state)
        return obs, reward, done, info

    def get_for_training_agent(self, inputs):
        """Return the training agent's entry of ``inputs``."""
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        """Store ``value`` in the training agent's slot of ``inputs``."""
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        """Return True once the training agent is dead or 800 steps passed."""
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    def build_obs(self, obs, state):
        """Encode ``obs`` as a ``(rows, cols, 10)`` array.

        Channel order: passage, wall, wood, bomb, bonus, me, enemy,
        bomb_life / 9, flame_life / 3, ammo / 12.
        """
        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        # Divide out of place so the env's own observation arrays are not
        # modified (they are still referenced from state['all_obs']).
        bomb_life = obs['bomb_life'] / 9
        flame_life = obs['flame_life'] / 3
        agent_id = state['agent_id']
        ammo = state['ammo']

        dtype = board.dtype
        passage = (board == 0).astype(dtype)
        wall = (board == 1).astype(dtype)
        wood = (board == 2).astype(dtype)
        bonus = ((board == 6) | (board == 7)).astype(dtype)
        is_agent = board >= 10
        me = (is_agent & (board == agent_id)).astype(dtype)
        enemy = (is_agent & (board != agent_id)).astype(dtype)

        # A bomb is visible either as board code 3 or hidden under an agent
        # (agent cell with a positive blast strength).
        # NOTE(review): the third argument of create_cross was lost; the
        # cell's blast strength is assumed — confirm against create_cross.
        bomb = np.zeros_like(board)
        has_bomb = (board == 3) | (is_agent & (bomb_blast_strength > 0))
        for y, x in np.argwhere(has_bomb):
            y, x = int(y), int(x)
            bomb = create_cross(bomb, (y, x), bomb_blast_strength[y][x])

        ammo = ammo * np.ones_like(board) / 12
        board = np.transpose(
            np.stack([
                passage, wall, wood, bomb, bonus, me, enemy, bomb_life,
                flame_life, ammo
            ]), [1, 2, 0])
        return board

    @staticmethod
    def init_observation_space(env_config):
        """
            observations for agents
            board: n^2 x 10 channels, each in [0, 1]
        """
        board_size = env_config['board_size']

        # passage, wall, wood, bomb, bonus, me, enemy, bomb_life, flame_life, ammo
        return spaces.Box(low=0, high=1, shape=(board_size, board_size, 10))

    @staticmethod
    def init_action_space():
        """Return the six-action discrete space."""
        return spaces.Discrete(6)

    def render(self):
        """Render the wrapped environment."""
        # NOTE(review): the body was missing; delegating to the wrapped env
        # is the presumed original — confirm.
        self.pomme.render()
Пример #7
class PomFFA(gym.Env):
    """Single-agent Gym view of a four-player Pommerman FFA match.

    The action of the agent with id ``player_agent_id`` is replaced by the
    action passed to :meth:`step`; rewards are sparse (+1 / -1 / 0).
    """

    # NOTE(review): the original contents of this list were lost; four
    # SimpleAgents are a placeholder — confirm the intended opponents.
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
    ]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):
        """Build the wrapped Pomme env.

        :param env_config: optional dict; keys that also exist in the
            competition config's ``env_kwargs`` override them, and
            ``is_training`` (default True) controls whether agents are
            initialised.
        """
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # initialize env twice could raise error here.
            # NOTE(review): the body of this branch was missing; calling
            # init() is the presumed original — confirm.
            self.init(pomme_config)

    def init(self, pomm_config):
        """Initialise every agent in ``agent_list`` and attach them to the env."""
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        # NOTE(review): no visible code registered the agents with the env;
        # added following the standard Pommerman setup sequence — confirm.
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        """Start a new episode and return the training agent's observation."""
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        # Refresh the alive list so step() does not reuse the previous
        # episode's list (which may no longer contain the player).
        self.alive_agents = obs['alive']
        return self.preproess(obs)

    def get_reward(self, obs, action, agent_id):
        """Return +1 for winning, -1 for dying or timing out, else 0."""
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 1
            else:
                return -1

        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -1

        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -1

        return 0

    def step(self, action):
        """Advance the game one tick using ``action`` for the training agent.

        :return: ``(obs, reward, done, {})`` for the training agent.
        """
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            # The player is not in the alive list: submit action 0.
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)

        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id
                not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preproess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        """Return the training agent's entry of ``inputs``."""
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        """Store ``value`` in the training agent's slot of ``inputs``."""
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))

        # NOTE(review): the `high=` arguments of `board` and
        # `bomb_blast_strength` were missing; values below are assumed —
        # confirm.
        board = spaces.Box(low=0,
                           high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0,
                                         high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength,
        })

    @staticmethod
    def preproess(obs):
        """Strip bookkeeping keys and wrap scalar fields in arrays.

        Operates on a shallow copy so the dict still referenced from
        ``all_obs`` / ``cur_obs`` keeps all its keys.
        """
        obs = dict(obs)
        for key in ("game_type", "game_env", "can_kick", "teammate",
                    "enemies", "step_count", "alive",
                    "bomb_moving_direction"):
            del obs[key]
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        """Render the wrapped environment."""
        # NOTE(review): the body was missing; delegating to the wrapped env
        # is the presumed original — confirm.
        self.pomme.render()
Пример #8
class PomFFA(gym.Env):
    """Single-agent Gym view of a four-player Pommerman FFA match."""

    # NOTE(review): the original contents of this list were lost (the
    # bracket was left unclosed); four SimpleAgents are a placeholder —
    # confirm the intended opponents.
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
    ]
    all_obs = None
    all_action = None
    pre_obs = None
    alive_agents = [10, 11, 12, 13]
    # Board id of the agent whose action is supplied through step().
    agent_id = 10
    # Class-level default only; reset() rebinds self.state to a fresh dict.
    state = {}

    def __init__(self, env_config=None):
        """Build the wrapped Pomme env and the reward helper.

        :param env_config: optional dict; keys that also exist in the
            competition config's ``env_kwargs`` override them, ``reward`` is
            forwarded to ``Reward`` and ``is_training`` (default True)
            controls whether agents are initialised.
        """
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
            self.reward = Reward(env_config.get("reward"))
        else:
            # No config given: fall back to the default reward settings.
            self.reward = Reward()

        print("Pommerman Config:", pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # initialize env twice could raise error here.
            # NOTE(review): the body of this branch was missing; calling
            # init() is the presumed original — confirm.
            self.init(pomme_config)

    def init(self, pomm_config):
        """Initialise every agent in ``agent_list`` and attach them to the env.

        :param pomm_config: Pommerman config dict; only ``game_type`` is read.
        """
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        # NOTE(review): no visible code registered the agents with the env,
        # yet step() relies on self.pomme.act(); added following the standard
        # Pommerman setup sequence — confirm.
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        """Start a new episode and return the training agent's observation.

        Also rebuilds ``self.state``, the bookkeeping dict that is passed to
        ``self.reward.get_reward`` on every step.
        """
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        state = {
            "prev_obs": copy.deepcopy(obs),
            "visited": set(),
            # Use the configured id rather than a second hard-coded 10 so it
            # stays in sync with get_for_training_agent().
            "agent_id": self.agent_id,
            "alive": [10, 11, 12, 13],
            "strength": 2,
            "ammo": 1,
            "bombs": {},
            "position": obs['position'],
        }
        self.state = state
        obs = self.preproess(obs)
        return obs

    def step(self, action):
        """Advance the game one tick using ``action`` for the training agent.

        :param action: action to submit for the training agent.
        :return: ``(obs, reward, done, {})`` for the training agent.
        """
        # Let every agent propose an action, then override the trainee's.
        joint_actions = self.set_for_training_agent(
            self.pomme.act(self.all_obs), action)

        self.all_obs, rewards, _, _ = self.pomme.step(joint_actions)
        agent_obs = self.get_for_training_agent(self.all_obs)

        # The reward helper returns the updated bookkeeping state as well.
        reward, self.state = self.reward.get_reward(action, agent_obs,
                                                    self.state)
        done = self.get_done(agent_obs)

        self.state['prev_obs'] = copy.deepcopy(agent_obs)
        self.state['position'] = agent_obs['position']

        return self.preproess(agent_obs), reward, done, {}

    def get_for_training_agent(self, inputs):
        """Return the training agent's entry of ``inputs``.

        The slot index is ``agent_id - 10``.
        """
        return inputs[self.agent_id - 10]

    def set_for_training_agent(self, inputs, value):
        """Store ``value`` in the training agent's slot and return ``inputs``.

        ``inputs`` is modified in place; the slot index is ``agent_id - 10``.
        """
        inputs[self.agent_id - 10] = value
        return inputs

    def get_done(self, obs):
        """Return True once the training agent is dead or 800 steps passed."""
        # `or` short-circuits, so step_count is only read while the agent
        # is still alive.
        return self.agent_id not in obs['alive'] or obs['step_count'] >= 800

    @staticmethod
    def init_observation_space(env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2

        :param env_config: dict providing ``board_size`` and ``num_items``.
        :return: a ``spaces.Dict`` describing the preprocessed observation.
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']

        # NOTE(review): the `high=` arguments of `board` and
        # `bomb_blast_strength` were missing; values below are assumed —
        # confirm.
        board = spaces.Box(low=0,
                           high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0,
                                         high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength,
        })

    @staticmethod
    def init_action_space():
        """Return the six-action discrete space.

        Declared static because the function takes no ``self``; without the
        decorator, calling it on an instance raises ``TypeError``.
        """
        return spaces.Discrete(6)

    @staticmethod
    def preproess(obs):
        """Strip bookkeeping keys and wrap scalar fields in arrays.

        Declared static because the function takes no ``self`` but is called
        as ``self.preproess(obs)``. It works on a shallow copy, so the dict
        still referenced from ``self.all_obs`` (and handed back to
        ``pomme.act`` on the next step) keeps all of its keys.

        :param obs: raw observation dict of one agent.
        :return: new dict without the dropped keys, with ``position``,
            ``ammo`` and ``blast_strength`` converted to numpy arrays.
        """
        obs = dict(obs)
        for key in ("game_type", "game_env", "can_kick", "teammate",
                    "enemies", "step_count", "alive",
                    "bomb_moving_direction"):
            del obs[key]

        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])

        return obs

    def render(self):