Example #1
class BaseEnv(gym.Env, ABC):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 3
    }
    reward_range = (-float('inf'), float('inf'))

    def __init__(self):
        self.viewer = None
        self.seed()

    @abstractmethod
    def step(self, action):
        pass

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def get_image(self):
        pass

    def render(self, mode='rgb_array', max_width=20):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        img = Image.fromarray(img).resize(
            [int(ratio * img_width),
             int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)

            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
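A minimal subclass sketch (illustrative only, assuming the BaseEnv above is in scope; the ToyGridEnv name, the 5x5 board and the corner-goal reward are invented for this example): the subclass only has to provide step, reset and get_image, while seeding, rendering and cleanup are inherited.

import numpy as np
from gym import spaces


class ToyGridEnv(BaseEnv):
    """Hypothetical subclass: one agent walking on a 5x5 board."""

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(4)  # up, down, left, right
        self.observation_space = spaces.Box(0, 4, shape=(2,), dtype=np.int64)
        self.pos = np.array([0, 0])

    def reset(self):
        self.pos = np.array([0, 0])
        return self.pos.copy()

    def step(self, action):
        moves = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
        self.pos = np.clip(self.pos + moves[action], 0, 4)
        done = bool((self.pos == [4, 4]).all())  # episode ends in the corner
        reward = 1.0 if done else 0.0
        return self.pos.copy(), reward, done, {}

    def get_image(self):
        # render() expects an H x W x 3 uint8 image
        img = np.zeros((5, 5, 3), dtype=np.uint8)
        img[tuple(self.pos)] = (255, 255, 255)
        return img


env = ToyGridEnv()
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
frame = env.render(mode='rgb_array')  # 20x20x3 uint8, because max_width=20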
Example #2
class ToyboxBaseEnv(AtariEnv, ABC):
    metadata = {'render.modes': ['human']}

    def __init__(self,
                 toybox,
                 game,
                 frameskip=(2, 5),
                 repeat_action_probability=0.,
                 grayscale=True,
                 alpha=False,
                 actions=None):
        assert (toybox.rstate)
        self.toybox = toybox
        # This is a workaround for issues with Gym wrappers
        # resetting state prematurely
        self.cached_state = None
        self.score = self.toybox.get_score()
        self.viewer = None

        # Required for compatibility with OpenAI Gym's Atari wrappers
        self.np_random = np_random
        self.ale = MockALE(toybox)
        utils.EzPickle.__init__(self, game, 'human', frameskip,
                                repeat_action_probability)

        # By default, we don't need actions passed in:
        if actions is None:
            actions = toybox.get_legal_action_set()
        assert (actions is not None)
        self._action_set = actions
        self._obs_type = 'image'
        self._rgba = 1 if grayscale else 4 if alpha else 3
        self._pixel_high = 255

        self._height = self.toybox.get_height()
        self._width = self.toybox.get_width()
        self._dim = (self._height, self._width, self._rgba
                     )  # * len(self.toybox.get_state()))

        self.reward_range = (0, float('inf'))
        self.action_space = spaces.Discrete(len(self._action_set))
        self.observation_space = spaces.Box(low=0,
                                            high=self._pixel_high,
                                            shape=self._dim,
                                            dtype='uint8')

    def seed(self, seed=None):
        """
        This mirrors the implementation in AtariEnv in openai/gym.
        """
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed. This gets passed as a uint, but gets
        # checked as an int elsewhere, so we need to keep it below
        # 2**31.
        # Toybox takes a uint seed, but we're copying the ALE seed for reasons above.
        # It's unclear who checks, so we stay on the safe side here.
        seed2 = seeding.hash_seed(seed1 + 1) % 2**31
        self.toybox.set_seed(seed2)
        # Start a new game to ensure that the seed gets used!
        self.toybox.new_game()
        return [seed1, seed2]

    # This is required to "trick" baselines into treating us as a regular Atari game
    # Implementation copied from baselines
    def get_action_meanings(self):
        #return [ACTION_MEANING[i] for i in self._action_set]
        return list(ACTION_MEANING.values())

    # From OpenAI Gym Baselines
    # https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
    def _get_obs(self):
        return self.toybox.get_state()

    def step(self, action_index):
        obs = None
        reward = None
        done = False
        info = {}

        # Sometimes the action_index is a numpy integer...
        #print('Action index and type', action_index, type(action_index))
        assert (action_index < len(self._action_set))
        assert (type(self._action_set) == list)

        self.toybox.apply_ale_action(self._action_set[action_index])

        if self.ale.game_over():
            print('GAME OVER')
            info['cached_state'] = self.toybox.to_state_json()

        obs = self._get_obs()

        # Compute the reward from the current score and reset the current score.
        score = self.toybox.get_score()
        reward = max(score - self.score, 0)
        self.score = score

        # Check whether the episode is done
        # use "ale" semantics here
        done = self.ale.game_over()

        # Send back diagnostic information
        info['lives'] = self.toybox.get_lives()
        #info['frame'] = frame
        info['score'] = 0 if done else self.score

        return obs, reward, done, info

    def reset(self):
        self.cached_state = self.toybox.to_state_json()
        self.toybox.new_game()
        self.score = self.toybox.get_score()
        obs = self._get_obs()
        return obs

    def render(self, mode='human', close=False):
        if mode == 'human':
            # the following is copied from gym's AtariEnv
            if self.viewer is None:
                from gym.envs.classic_control.rendering import SimpleImageViewer
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(self.toybox.get_rgb_frame())
            return self.viewer.isopen
        elif mode == 'rgb_array':
            return self.toybox.get_rgb_frame()

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
        del self.toybox
        self.toybox = None
Example #3
class BaseEnv(gym.Env, ABC):
    """Base class for all mazelab environments. 
    
    The subclass should implement at least the following:
    
    - :meth:`step`
    - :meth:`reset`
    - :meth:`get_image`

    """
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 3
    }

    def __init__(self, maze, motion):
        self.maze = maze
        self.motion = motion

        self.observation_space = spaces.Box(low=-np.inf,
                                            high=np.inf,
                                            shape=self.maze.size,
                                            dtype=np.float32)
        self.action_space = spaces.Discrete(self.motion.size)

        self.viewer = None

        self.seed()

    @abstractmethod
    def step(self, action):
        pass

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def get_image(self):
        pass

    def render(self, mode='human', max_width=500):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        img = Image.fromarray(img).resize(
            [int(ratio * img_width),
             int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)

            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
Example #4
class RetroEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 60.0
    }

    def compute_step(self, image):
        reward = self.data.current_reward()
        done = self.data.is_done()
        return reward, done, self.data.lookup_all()

    def record_movie(self, path):
        self.movie = retro.Movie(path, True)
        self.movie.configure(self.gamename, self.em)
        if self.initial_state:
            self.movie.set_state(self.initial_state)

    def stop_record(self):
        self.movie_path = None
        self.movie_id = 0
        if self.movie:
            self.movie.close()
            self.movie = None

    def auto_record(self, path=None):
        if not path:
            path = os.getcwd()
        self.movie_path = path

    def __init__(self,
                 game,
                 state=retro.STATE_DEFAULT,
                 scenario=None,
                 info=None,
                 use_restricted_actions=retro.ACTIONS_FILTERED,
                 record=False):
        if not hasattr(self, 'spec'):
            self.spec = None
        self.img = None
        self.viewer = None
        self.gamename = game
        self.statename = state

        game_path = retro.get_game_path(game)
        rom_path = retro.get_romfile_path(game)
        metadata_path = os.path.join(game_path, 'metadata.json')

        if state == retro.STATE_NONE:
            self.initial_state = None
        elif state == retro.STATE_DEFAULT:
            self.initial_state = None
            try:
                with open(metadata_path) as f:
                    metadata = json.load(f)
                if 'default_state' in metadata:
                    with gzip.open(
                            os.path.join(game_path, metadata['default_state'])
                            + '.state', 'rb') as fh:
                        self.initial_state = fh.read()
            except (IOError, json.JSONDecodeError):
                pass
        else:
            if not state.endswith('.state'):
                state += '.state'

            with gzip.open(os.path.join(game_path, state), 'rb') as fh:
                self.initial_state = fh.read()

        self.data = GameData()

        if info is None:
            info = 'data'

        if info.endswith('.json'):
            # assume it's a path
            info_path = info
        else:
            info_path = os.path.join(game_path, info + '.json')

        if scenario is None:
            scenario = 'scenario'

        if scenario.endswith('.json'):
            # assume it's a path
            scenario_path = scenario
        else:
            scenario_path = os.path.join(game_path, scenario + '.json')

        system = retro.get_romfile_system(rom_path)

        # We can't have more than one emulator per process. Before creating an
        # emulator, ensure that unused ones are garbage-collected
        gc.collect()
        self.em = retro.RetroEmulator(rom_path)
        self.em.configure_data(self.data)
        self.em.step()
        img = self.em.get_screen()

        core = retro.get_system_info(system)
        self.BUTTONS = core['buttons']
        self.NUM_BUTTONS = len(self.BUTTONS)
        self.BUTTON_COMBOS = self.data.valid_actions()

        try:
            assert self.data.load(
                info_path,
                scenario_path), 'Failed to load info (%s) or scenario (%s)' % (
                    info_path, scenario_path)
        except Exception:
            del self.em
            raise

        if use_restricted_actions == retro.ACTIONS_DISCRETE:
            combos = 1
            for combo in self.BUTTON_COMBOS:
                combos *= len(combo)
            self.action_space = gym.spaces.Discrete(combos)
        elif use_restricted_actions == retro.ACTIONS_MULTI_DISCRETE:
            self.action_space = gym.spaces.MultiDiscrete([
                len(combos) if gym_version >= (0, 9, 6) else
                (0, len(combos) - 1) for combos in self.BUTTON_COMBOS
            ])
        else:
            self.action_space = gym.spaces.MultiBinary(self.NUM_BUTTONS)

        kwargs = {}
        if gym_version >= (0, 9, 6):
            kwargs['dtype'] = np.uint8
        self.observation_space = gym.spaces.Box(low=0,
                                                high=255,
                                                shape=img.shape,
                                                **kwargs)

        self.use_restricted_actions = use_restricted_actions
        self.movie = None
        self.movie_id = 0
        self.movie_path = None
        if record is True:
            self.auto_record()
        elif record is not False:
            self.auto_record(record)
        self.seed()
        if gym_version < (0, 9, 6):
            self._seed = self.seed
            self._step = self.step
            self._reset = self.reset
            self._render = self.render
            self._close = self.close

    def step(self, a):
        if self.img is None:
            raise RuntimeError('Please call env.reset() before env.step()')

        action = 0
        if self.use_restricted_actions == retro.ACTIONS_DISCRETE:
            for combo in self.BUTTON_COMBOS:
                current = a % len(combo)
                a //= len(combo)
                action |= combo[current]
        elif self.use_restricted_actions == retro.ACTIONS_MULTI_DISCRETE:
            for i in range(len(a)):
                buttons = self.BUTTON_COMBOS[i]
                action |= buttons[a[i]]
        else:
            for i in range(len(a)):
                action |= int(a[i]) << i
            if self.use_restricted_actions == retro.ACTIONS_FILTERED:
                action = self.data.filter_action(action)
        a = np.zeros([16], np.uint8)
        for i in range(16):
            a[i] = (action >> i) & 1
            if self.movie:
                self.movie.set_key(i, a[i])
        if self.movie:
            self.movie.step()
        self.em.set_button_mask(a)
        self.em.step()
        self.img = ob = self.em.get_screen()
        self.data.update_ram()
        rew, done, info = self.compute_step(ob)
        return ob, float(rew), bool(done), dict(info)

    def reset(self):
        if self.initial_state:
            self.em.set_state(self.initial_state)
        self.em.set_button_mask(np.zeros([16], np.uint8))
        self.em.step()
        if self.movie_path is not None:
            self.record_movie(
                os.path.join(
                    self.movie_path, '%s-%s-%04d.bk2' %
                    (self.gamename, self.statename, self.movie_id)))
            self.movie_id += 1
        if self.movie:
            self.movie.step()
        self.img = ob = self.em.get_screen()
        self.data.reset()
        self.data.update_ram()
        return ob

    def seed(self, seed=None):
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed. This gets passed as a uint, but gets
        # checked as an int elsewhere, so we need to keep it below
        # 2**31.
        seed2 = seeding.hash_seed(seed1 + 1) % 2**31
        return [seed1, seed2]

    def render(self, mode='human', close=False):
        if close:
            if self.viewer:
                self.viewer.close()
            return
        if mode == "rgb_array":
            return self.em.get_screen() if self.img is None else self.img
        elif mode == "human":
            if self.viewer is None:
                from gym.envs.classic_control.rendering import SimpleImageViewer
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(self.img)
            return self.viewer.isopen

    def close(self):
        if hasattr(self, 'em'):
            del self.em

        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
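The usual Gym reset/step loop applies. The snippet below is an assumption-laden usage sketch, not part of the source: environments are normally created through retro.make rather than by instantiating RetroEnv directly, and 'Airstriker-Genesis' is used only because it is the ROM commonly distributed with gym-retro.

import retro

env = retro.make(game='Airstriker-Genesis')  # assumed game name

obs = env.reset()
done = False
total_reward = 0.0
while not done:
    # With the default ACTIONS_FILTERED setting, the action space is
    # MultiBinary over the console's buttons.
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    total_reward += reward
    env.render()  # optional: opens a SimpleImageViewer window
env.close()
print('episode reward:', total_reward)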
Example #5
class Water(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array'],
                'video.frames_per_second': 3}
    FIELD = [
        'M',  # 0 agent
        'S',  # 1 start
        'G',  # 2 goal
        'W',  # 3 water
        'N',  # 4 nothing
    ]
    # upper bound on the number of steps per episode
    MAX_STEPS = 5000

    def __init__(self):
        super().__init__()
        self.viewer = None
        self.radius = 5
        self.rotation = 10
        self.ellipce_r = 10
        self.ellipce_c = 12
        self.x_shape = 10 * self.radius
        self.y_shape = 10 * self.radius
        self.MAP_shape = (self.x_shape, self.y_shape)
        # set an action space
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Box(
            low=0,
            high=len(self.FIELD),
            shape=self.MAP_shape
        )
        nrows, ncols = self.MAP_shape
        self.reward_range = [-1., 1.]
        self.reset()

    def reset(self):
        self.map = self.ellipse_map
        nrows, ncols = self.MAP_shape
        self.pos = self.find_pos('S')[0]
        # self.goal = self.find_pos('G')[0]
        self.done = False
        self.reward = 0
        self.steps = 0
        self.visited = []
        return self.observe()
        
    # depict the map
    def ellipse_map(self):
        self.x = np.ones((self.x_shape, self.y_shape), dtype=np.uint8)
        self.x[self.x == 1] = 4
        # Start
        self.x[(0, 0)] = 1
        self.x_a, self.y_a = ellipse(self.x_shape/2, self.y_shape/2, self.ellipce_r, self.ellipce_c, rotation=np.deg2rad(self.rotation))
        self.x[(self.x_a, self.y_a)] = 3
        return self.x

    def is_movable(self, pos):
        return (0 <= pos[0] < self.x_shape) and (0 <= pos[1] < self.y_shape)

    # check whether the agent has reached the goal
    def is_goal(self, show=False):
        nrows, ncols = self.MAP_shape
        if self.pos[0] == nrows - 1 and self.pos[1] == ncols - 1:
            if show:
                print("Goal")
            return True
        else:
            return False

    def is_done(self, show=False):
        return (not self.is_movable(self.pos)) or self.is_goal(show) or self.steps > self.MAX_STEPS

    def observe(self):
        # copy the map and mark the agent's position
        observation = np.copy(self.map())
        observation[tuple(self.pos)] = self.FIELD.index('M')
        return observation

    def point_finder(self):
        flat_space = np.reshape(self.observe(), [-1, 1])
        #print(flat_space)
        point = np.where(flat_space == 0)
        return int(point[0])

    def trace(self):
        self.row, self.col = np.where(self.observe() == 0)
        self.visited.append((int(self.row), int(self.col)))
        return self.visited

    def get_reward(self, pos, moved):
        nrows, ncols = self.MAP_shape
        if moved:
            if self.map()[tuple(pos)] == self.FIELD.index('W'):
                self.reward -= 10
            elif self.map()[tuple(pos)] == self.FIELD.index('N'):
                self.reward -= 0.3
        else:
            self.reward -= 0.5
        # Goal
        if self.is_goal():
            self.reward += 15
        return self.reward

    def find_pos(self, field_type):
        return np.array([np.where(self.map() == self.FIELD.index(field_type))])

    def step(self, action):
        nrows, ncols = self.MAP_shape
        # count steps so that MAX_STEPS can terminate the episode
        self.steps += 1
        if action == 0:
            next_pos = [x + y for (x, y) in zip(self.pos, [0, 1])]
        elif action == 1:
            next_pos = [x + y for (x, y) in zip(self.pos, [-1, 0])]
        elif action == 2:
            next_pos = [x + y for (x, y) in zip(self.pos, [1, 0])]
        elif action == 3:
            next_pos = [x + y for (x, y) in zip(self.pos, [0, -1])]

        if self.is_movable(next_pos):
            self.pos = next_pos
            moved = True
        else:
            moved = False
        reward = self.get_reward(self.pos, moved)
        observation = self.observe()
        trace = self.trace()
        state = self.point_finder()
        done = self.is_done(True)
        return trace, state, reward, observation, done

    def show(self):
        # plt.grid('on')
        ims = []
        nrows, ncols = self.MAP_shape
        fig = plt.figure()
        ax = plt.gca()
        ax.set_xticks(np.arange(0.5, nrows, 1))
        ax.set_yticks(np.arange(0.5, ncols, 1))
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        canvas = np.copy(self.map())
        for row, col in self.visited:
            canvas[(row, col)] = self.FIELD.index('M')
            img1 = plt.imshow(canvas, interpolation="bilinear", cmap=cm.GnBu)
            ims.append([img1])
        img = plt.imshow(canvas, interpolation="bilinear", cmap=cm.GnBu, animated=True)
        ani = animation.ArtistAnimation(fig, ims, interval=100, blit=True, repeat_delay=1000)
        plt.show()
        return

    @abstractmethod
    def get_image(self):
        pass

    def render(self, mode='human', max_width=500):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        #img = Image.fromarray(img).resize([int(ratio * img_width), int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)

            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
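Note that step here returns a 5-tuple (trace, state, reward, observation, done) instead of Gym's usual 4-tuple, so standard wrappers will not unpack it correctly. A rough interaction sketch under that assumption:

env = Water()
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # 0..3
    trace, state, reward, obs, done = env.step(action)
print('steps taken:', env.steps, 'cumulative reward:', reward)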
Example #6
class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 obs_type,
                 default_reward,
                 action_space,
                 act_null_value=4,
                 delay=30,
                 resize_scale=8,
                 crop_window=[5, 5]):
        """Create an `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            obs_type: type of observation to return.
            default_reward: default reward if reward is None returned by the
                `pycolab` game.
            action_space: the action `Space` of the environment.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
            crop_window: dimensions of observation cropping.
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)

        self._max_iterations = max_iterations
        self._default_reward = default_reward

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None

        self._colors = self.make_colors()
        test_game = self.make_game()
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Create the observation space.
        self.obs_type = obs_type

        if self.obs_type == 'mask':
            self.observation_space = spaces.Box(
                0., 1., [len(self.state_layer_chars)] +
                crop_window)  # don't count empty space layer
        elif self.obs_type == 'rgb':
            self.observation_space = spaces.Box(
                0., 255.,
                [crop_window[0] * resize_scale, crop_window[1] * resize_scale
                 ] + [3])
        self.action_space = action_space
        self.act_null_value = act_null_value

        self.current_game = None
        self._croppers = []
        self._state = None

        self._last_uncropped_observations = None
        self._empty_uncropped_board = None
        self._last_cropped_observations = None
        self._empty_cropped_board = None

        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.resize_scale = resize_scale
        self.delay = delay

        # Metrics
        self.visitation_frequency = {char: 0 for char in self.objects}
        self.first_visit_time = {char: 500 for char in self.objects}

        # Heatmaps
        self.episodes = 0  # number of episodes run (to determine when to save heatmaps)
        self.heatmap_save_freq = 3  # save heatmaps every 3 episodes
        self.heatmap = np.ones(
            (5, 5))  # stores counts each episode (5x5 is a placeholder)

    def pycolab_init(self, logdir, log_heatmaps):
        self.log_heatmaps = log_heatmaps
        root_path = os.path.abspath(__file__).split('/')[1:]
        root_path = root_path[:root_path.index('curiosity_baselines') + 1]
        self.heatmap_path = '/' + '/'.join(root_path) + '/' + '/'.join(
            logdir.split('/')[1:]) + '/heatmaps'
        if not os.path.isdir(self.heatmap_path) and log_heatmaps:
            os.makedirs(self.heatmap_path)

    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Functions that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """

        return {
            'P': (255., 255., 255.),
            'a': (175., 255., 15.),
            'b': (21., 0., 255.),
            'c': (250., 0., 129.),
            'd': (0., 250., 71.),
            'e': (255., 0., 0.),
            'f': (252., 28., 3.),
            'g': (136., 3., 252.),
            'h': (20., 145., 60.),
            '#': (61., 61., 61.),
            '@': (255., 255., 0.),
            ' ': (0., 0., 0.)
        }

    def _paint_board(self, layers, cropped=False):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.
            cropped: whether or not this is being called to paint cropped or
                     uncropped images.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
                layers.
        """
        if not cropped:
            board_shape = self._last_uncropped_observations.board.shape
        else:
            board_shape = self._last_cropped_observations.board.shape

        board = np.zeros(list(board_shape) + [3], np.uint32)
        board_mask = np.zeros(list(board_shape) + [3], dtype=bool)

        for key in self._render_order:
            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # Update the board with the new layer.
            board = np.where(np.logical_not(board_mask),
                             board_layer_mask * color, board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        # disentangled one hot state

        if self.obs_type == 'mask':
            self._state = []
            for char in self.state_layer_chars:
                if char != ' ':
                    mask = observations.layers[char].astype(float)
                    if char in self.objects and 1. in mask:
                        self.visitation_frequency[char] += 1
                    self._state.append(mask)
            self._state = np.array(self._state)

        elif self.obs_type == 'rgb':
            rgb_img = self._paint_board(observations.layers,
                                        cropped=True).astype(float)
            self._state = self.resize(rgb_img)
            for char in self.state_layer_chars:
                if char != ' ':
                    mask = observations.layers[char].astype(float)
                    if char in self.objects and 1. in mask:
                        self.visitation_frequency[char] += 1

        # update heatmap metric
        if self.log_heatmaps:
            pr, pc = self.current_game.things['P'].position
            self.heatmap[pr, pc] += 1

        self._last_reward = reward if reward is not None else \
            self._default_reward

        self._game_over = self.current_game.game_over

        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def reset(self):
        """Start a new episode."""
        self.current_game = self.make_game()
        for cropper in self._croppers:
            cropper.set_engine(self.current_game)
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None

        observations, reward, _ = self.current_game.its_showtime()
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations)

        # save and reset metrics
        self.visitation_frequency = {char: 0 for char in self.objects}
        if self.log_heatmaps and self.episodes % self.heatmap_save_freq == 0:
            np.save('{}/{}.npy'.format(self.heatmap_path, self.episodes),
                    self.heatmap)
            heatmap_normed = self.heatmap / np.linalg.norm(self.heatmap)
            plt.imsave('{}/{}.png'.format(self.heatmap_path, self.episodes),
                       heatmap_normed,
                       cmap='afmhot',
                       vmin=0.0,
                       vmax=1.0)
        self.episodes += 1
        self.heatmap = np.zeros(self._last_uncropped_observations.board.shape)

        # run update
        self._update_for_game_step(observations, reward)
        return self._state

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended, call `reset` instead..")
            self._state = None
            reward = self._last_reward
            done = self._game_over
            return self._state, reward, done, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)

        # Crop and update
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations.board)

        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Add custom metrics
        info['visitation_frequency'] = self.visitation_frequency
        info['first_time_visit'] = self.first_visit_time

        # Check the current status of the game.
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None

        return self._state, reward, done, info

    def render(self, mode='rgb_array', close=False):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        img = self._empty_uncropped_board
        if self._last_uncropped_observations:
            img = self._last_uncropped_observations.board
            layers = self._last_uncropped_observations.layers
            if self._colors:
                img = self._paint_board(layers, cropped=False)
            else:
                assert img is not None, '`board` must not be `None`.'

        img = self.resize(img)

        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def resize(self, img):
        img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        return img.astype(np.uint8)

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
Example #7
class GridWorld(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self,
                 file_name="map1.txt",
                 fail_rate=0.0,
                 terminal_reward=1.0,
                 move_reward=0.0,
                 bump_reward=-0.5,
                 bomb_reward=-1.0):
        self.viewer = SimpleImageViewer()
        self.n = None
        self.m = None
        self.bombs = []
        self.walls = []
        self.goals = []
        self.start = None
        this_file_path = os.path.dirname(os.path.realpath(__file__))
        file_name = os.path.join(this_file_path, file_name)
        with open(file_name, "r") as f:
            for i, row in enumerate(f):
                row = row.rstrip('\r\n')
                if self.n is not None and len(row) != self.n:
                    raise ValueError(
                        "Map's rows are not of the same dimension...")
                self.n = len(row)
                for j, col in enumerate(row):
                    if col == "x" and self.start is None:
                        self.start = self.n * i + j
                    elif col == "x" and self.start is not None:
                        raise ValueError(
                            "There is more than one starting position in the map..."
                        )
                    elif col == "G":
                        self.goals.append(self.n * i + j)
                    elif col == "B":
                        self.bombs.append(self.n * i + j)
                    elif col == "1":
                        self.walls.append(self.n * i + j)
            self.m = i + 1
        if len(self.goals) == 0:
            raise ValueError("At least one goal needs to be specified...")
        self.n_states = self.n * self.m
        self.n_actions = 4
        self.fail_rate = fail_rate
        self.state = self.start
        self.terminal_reward = terminal_reward
        self.move_reward = move_reward
        self.bump_reward = bump_reward
        self.bomb_reward = bomb_reward
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(self.n_states)
        self.done = False

    def step(self, action):
        assert self.action_space.contains(action)
        if self.state in self.goals or np.random.rand() < self.fail_rate:
            return self.state, 0.0, self.done, None
        else:
            new_state = self.take_action(action)
            reward = self.get_reward(new_state)
            self.state = new_state
            return self.state, reward, self.done, None

    def reset(self):
        self.done = False
        self.state = self.start
        return self.state

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        # Build the RGB grid: green background, red goals, yellow bombs,
        # black walls, blue agent.
        grid = np.multiply(np.ones((self.n_states, 3), dtype=np.uint8),
                           np.array([0, 255, 0], dtype=np.uint8))
        for g in self.goals:
            grid[g] = np.array([255, 0, 0])
        for b in self.bombs:
            grid[b] = np.array([255, 255, 0])
        for w in self.walls:
            grid[w] = np.array([0, 0, 0])
        grid[self.state] = np.array([0, 0, 255])
        grid = grid.reshape(self.m, self.n, 3)
        if mode == 'human':
            self.viewer.imshow(grid)
            return self.viewer.isopen
        elif mode == "rgb_array":
            return grid
        else:
            return

    def take_action(self, action):
        row = self.state // self.n
        col = self.state % self.n
        if action == DOWN and (row + 1) * self.n + col not in self.walls:
            row = min(row + 1, self.m - 1)
        elif action == UP and (row - 1) * self.n + col not in self.walls:
            row = max(0, row - 1)
        elif action == RIGHT and row * self.n + col + 1 not in self.walls:
            col = min(col + 1, self.n - 1)
        elif action == LEFT and row * self.n + col - 1 not in self.walls:
            col = max(0, col - 1)
        new_state = row * self.n + col
        return new_state

    def get_reward(self, new_state):
        if new_state in self.goals:
            self.done = True
            return self.terminal_reward
        elif new_state in self.bombs:
            return self.bomb_reward
        elif new_state == self.state:
            return self.bump_reward
        return self.move_reward
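Judging from the parser in __init__, the map file uses x for the start cell, G for goals, B for bombs, 1 for walls, and any other character for free space. The file contents and loop below are illustrative assumptions, not the actual map1.txt shipped with the source.

# A hypothetical map1.txt (5 columns per row):
#
#   x0000
#   01010
#   0B010
#   01000
#   0001G
#
env = GridWorld(file_name="map1.txt")
state = env.reset()
done = False
while not done:
    # step() returns (state, reward, done, None); the info slot is None here.
    state, reward, done, _ = env.step(env.action_space.sample())
print('reached goal state', state)
env.render(close=True)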
Example #8
class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 obs_type,
                 default_reward,
                 action_space,
                 act_null_value=4,
                 delay=30,
                 resize_scale=8,
                 crop_window=[5, 5],
                 visitable_states=0,
                 color_palette=0,
                 reward_switch=[],
                 reward_config=dict(),
                 switch_perturbations=[],
                 dimensions=(19, 19)):
        """Create an `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            obs_type: type of observation to return.
            default_reward: default reward if reward is None returned by the
                `pycolab` game.
            action_space: the action `Space` of the environment.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
            crop_window: dimensions of observation cropping.
            visitable_states: number of states the agent can visit.
            color_palette: which color palette to use for objects.
            reward_switch: list of objects or coords if the reward function switches.
            reward_config: list of objects and their associated rewards.
            switch_perturbations: color perturbations if a background switch is applied.
            dimensions: dimensions of the game board
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)

        self._max_iterations = max_iterations

        # Reward specs
        self._default_reward = default_reward
        self._switch = 0
        self._reward_switch = reward_switch
        self._reward_target = None
        self._switch_perturbations = switch_perturbations
        self._reward_config = reward_config

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None
        self._color_palette = color_palette
        self._colors = self.make_colors()
        test_game = self.make_game(reward_config=self._reward_config)
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Prepare observation space.
        self.obs_type = obs_type
        self.height, self.width = dimensions
        self.crop_window = crop_window
        self.action_space = action_space
        if self.obs_type == 'mask':
            self.observation_space = spaces.Box(
                0., 1., [len(self.state_layer_chars)] +
                self.crop_window)  # don't count empty space layer
        elif self.obs_type == 'rgb':
            self.observation_space = spaces.Box(
                0., 255.,
                [self.crop_window[0] * 17, self.crop_window[1] * 17] + [3])
        elif self.obs_type == 'rgb_full':
            if 84 % self.width == 0:
                self.observation_space = spaces.Box(0., 255., [84, 84] + [3])
            else:
                self.observation_space = spaces.Box(0., 255., [85, 85] + [3])
        self.act_null_value = act_null_value
        self.visitable_states = visitable_states

        self.current_game = None
        self._croppers = []
        self._state = None

        self._last_uncropped_observations = None
        self._empty_uncropped_board = None
        self._last_cropped_observations = None
        self._empty_cropped_board = None

        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.delay = delay

        # Metrics
        self.visitation_frequency = {char: 0 for char in self.objects}
        self.first_visit_time = {char: 500 for char in self.objects}
        self.visitation_entropy = 0
        self.num_obj_eps = {char: 0 for char in self.objects}
        self.coverage = 0

    def heatmap_init(self, logdir, log_heatmaps):
        self.episodes = 0  # number of episodes run (to determine when to save heatmaps)
        self.heatmap_save_freq = 3  # save heatmaps every 3 episodes
        self.heatmap = np.zeros(
            (5, 5))  # stores counts each episode (5x5 is a placeholder)
        self.log_heatmaps = log_heatmaps
        root_path = os.path.abspath(__file__).split('/')[1:]
        root_path = root_path[:root_path.index('curiosity_baselines') + 1]
        self.heatmap_path = '/' + '/'.join(root_path) + '/' + '/'.join(
            logdir.split('/')[1:]) + '/heatmaps'
        self.startup = True
        if not os.path.isdir(self.heatmap_path) and log_heatmaps:
            os.makedirs(self.heatmap_path)
        elif os.path.isdir(self.heatmap_path):
            heatmaps = os.listdir(self.heatmap_path)
            if len(heatmaps) != 0:
                sorted_images = sorted(heatmaps,
                                       key=lambda img: int(img.split('.')[0]))
                last_episode = int(sorted_images[-1].split('.')[0])
                self.episodes = last_episode

    def obs_init(self, resize_scale):
        self.resize_scale = resize_scale

    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Functions that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """
        if self._color_palette == 0:
            return {
                'P': (255., 255., 255.),
                'a': (175., 255., 15.),
                'b': (21., 0., 255.),
                'c': (255., 0., 0.),
                'd': (19., 139., 67.),
                'e': (250., 0., 129.),
                'f': (114., 206., 227.),
                'g': (136., 3., 252.),
                'h': (245., 119., 34.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }
        elif self._color_palette == 1:
            return {
                'P': (255., 255., 255.),
                'a': (136., 3., 252.),
                'b': (21., 0., 255.),
                'c': (255., 0., 0.),
                'd': (19., 139., 67.),
                'e': (150., 0., 129.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }
        elif self._color_palette == 2:
            return {
                'P': (255., 255., 255.),
                'a': (255., 0., 0.),
                'b': (255., 0., 0.),
                'c': (255., 0., 0.),
                'd': (255., 0., 0.),
                'e': (255., 0., 0.),
                'f': (255., 0., 0.),
                'g': (255., 0., 0.),
                'h': (255., 0., 0.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }
        elif self._color_palette == 3:
            return {
                'P': (255., 255., 255.),
                'a': (30., 60., 90.),
                'b': (90., 60., 30.),
                'c': (90., 30., 60.),
                'd': (10., 100., 70.),
                'e': (10., 10., 160.),
                'f': (25., 130., 25.),
                'g': (50., 40., 90.),
                'h': (130., 25., 25.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }

    def _check_visit(self, char):
        """Private method to check if the player
        has visited "char". A visit is when the
        character is within the 5x5 tile window
        around the player.
        """
        pr, pc = self.current_game.things['P'].position
        cr, cc = self.current_game.things[char].position
        if (pr - 2) <= cr <= (pr + 2) and (pc - 2) <= cc <= (pc + 2):
            return True
        return False

    def _paint_board(self, layers, cropped=False):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.
            cropped: whether or not this is being called to paint cropped or
                     uncropped images.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
                layers.
        """
        if not cropped:
            board_shape = self._last_uncropped_observations.board.shape
        else:
            board_shape = self._last_cropped_observations.board.shape

        board = np.zeros(list(board_shape) + [3], np.uint32)
        board_mask = np.zeros(list(board_shape) + [3], dtype=bool)

        for key in self._render_order:

            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # '@' corresponds to white noise or a changing background
            perturbation = np.zeros(board_layer_mask.shape)
            if key == '@':
                if len(self._reward_switch) > 0:
                    perturbation = self._switch_perturbations[self._switch]
                else:
                    h, w = board_layer_mask.shape[:2]
                    perturbation = np.random.randint(-15, 15, (h, w, 1))

            # Update the board with the new layer.
            board = np.where(np.logical_not(board_mask),
                             board_layer_mask * color + perturbation, board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        # disentangled one hot state
        if self.obs_type == 'mask':
            self._state = []
            for char in self.state_layer_chars:
                if char in self.objects:
                    mask = observations.layers[char].astype(float)
                    if char in self.objects and 1. in mask:
                        self.visitation_frequency[char] += 1
                    self._state.append(mask)
            self._state = np.array(self._state)

        elif 'rgb' in self.obs_type:
            if self.obs_type == 'rgb':
                rgb_img = self._paint_board(observations.layers,
                                            cropped=True).astype(float)
            elif self.obs_type == 'rgb_full':
                rgb_img = self._paint_board(observations.layers,
                                            cropped=False).astype(float)
            self._state = self.resize(rgb_img)
            for char in self.state_layer_chars:
                if char in self.objects:
                    mask = observations.layers[char].astype(float)
                    if self._check_visit(char):
                        self.visitation_frequency[char] += 1

        # update heatmap metric
        if self.log_heatmaps:
            pr, pc = self.current_game.things['P'].position
            self.heatmap[pr, pc] += 1
            self.visitation_entropy = entropy(self.heatmap.flatten(),
                                              base=self.visitable_states)
            self.coverage = np.count_nonzero(
                self.heatmap) / self.visitable_states

        # update reward
        self._last_reward = reward if reward is not None else self._default_reward

        self._game_over = self.current_game.game_over

        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended, call `reset` instead..")
            self._state = None
            reward = self._last_reward
            done = self._game_over
            return self._state, reward, done, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)

        # Crop and update
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations.board)

        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Add custom metrics
        info['visitation_frequency'] = self.visitation_frequency
        info['first_time_visit'] = self.first_visit_time
        info['visitation_entropy'] = self.visitation_entropy
        info['coverage'] = self.coverage
        info['episodes'] = self.episodes
        info['num_obj_eps'] = self.num_obj_eps
        for ob in self.objects:
            pushes = getattr(self.current_game.things[ob], 'pushes', None)
            if pushes is not None:
                info['controllable_interactions'] = pushes

        # Check the current status of the game.
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None

        return self._state, reward, done, info

    def reset(self):
        """Start a new episode."""
        if len(self._reward_switch) > 0:
            self._switch = np.random.randint(len(self._reward_switch))
            self._reward_target = self._reward_switch[self._switch]
            self._reward_config = {char: 0.0 for char in self._reward_switch}
            self._reward_config[self._reward_switch[self._switch]] = 1.0
        self.current_game = self.make_game(reward_config=self._reward_config)
        for cropper in self._croppers:
            cropper.set_engine(self.current_game)
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None

        observations, reward, _ = self.current_game.its_showtime()
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations)

        # save and reset metrics
        for char in self.objects:
            if self.visitation_frequency[char] > 0:
                self.num_obj_eps[char] += 1
        self.visitation_frequency = {char: 0 for char in self.objects}
        if self.log_heatmaps and self.episodes % self.heatmap_save_freq == 0 and not self.startup:
            np.save('{}/{}.npy'.format(self.heatmap_path, self.episodes),
                    self.heatmap)
            heatmap_normed = self.heatmap / (np.linalg.norm(self.heatmap) +
                                             1e-19)
            plt.imsave('{}/{}.png'.format(self.heatmap_path, self.episodes),
                       heatmap_normed,
                       cmap='afmhot',
                       vmin=0.0,
                       vmax=1.0)
        self.episodes += 1
        self.startup = False
        self.heatmap = np.zeros(self._last_uncropped_observations.board.shape)

        # run update
        self._update_for_game_step(observations, reward)
        return self._state

    def render(self, mode='rgb_array', close=False):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        img = self._empty_uncropped_board
        if self._last_uncropped_observations:
            img = self._last_uncropped_observations.board
            layers = self._last_uncropped_observations.layers
            if self._colors:
                img = self._paint_board(layers, cropped=False)
            else:
                assert img is not None, '`board` must not be `None`.'

        img = self.resize(img, scale=17)

        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def resize(self, img, scale=None):
        if scale is None:
            img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        else:
            img = _repeat_axes(img, scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        return img.astype(np.uint8)

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
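The adapter above follows the standard gym loop; here is a minimal rollout sketch, assuming a concrete subclass (the name MySwitchEnv and its construction are illustrative, not taken from the source).

# Minimal rollout sketch; MySwitchEnv is a hypothetical concrete subclass.
env = MySwitchEnv()  # constructor arguments omitted; not shown in this excerpt
obs = env.reset()    # stacked binary layers built in _update_for_game_step
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
frame = env.render(mode='rgb_array')  # np.uint8 RGB image of the board
env.close()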
Example No. 9
class KrazyGridWorld:
    def __init__(self,
                 screen_height,
                 grid_squares_per_row=10,
                 one_hot_obs=True,
                 seed=42,
                 task_seed=None,
                 init_pos_seed=None,
                 death_square_percentage=0.1,
                 ice_sq_perc=0.05,
                 num_goals=3,
                 min_goal_distance=2,
                 max_goal_distance=np.inf,
                 num_steps_before_energy_needed=11,
                 energy_replenish=8,
                 energy_sq_perc=0.05,
                 num_transporters=1,
                 sparse_rewards=True,
                 image_obs=True,
                 use_local_obs=False):

        if task_seed is None:
            task_seed = seed

        if init_pos_seed is None:
            init_pos_seed = seed

        self.init_pos_rng = np.random.RandomState(init_pos_seed)
        self.task_rng = np.random.RandomState(task_seed)
        random.seed(task_seed)

        self.one_hot_obs = one_hot_obs
        self.image_obs = image_obs
        self.use_local_obs = use_local_obs
        self.screen_dim = (screen_height, screen_height)  # width and height

        self.tile_types = TileTypes()
        self.agent = Agent(
            num_steps_until_energy_needed=num_steps_before_energy_needed,
            energy_replenish=energy_replenish)
        self.game_grid = GameGrid(grid_squares_per_row=grid_squares_per_row,
                                  tile_types=self.tile_types,
                                  agent=self.agent,
                                  task_rng=self.task_rng,
                                  death_sq_perc=death_square_percentage,
                                  energy_sq_perc=energy_sq_perc,
                                  ice_sq_perc=ice_sq_perc,
                                  num_goals=num_goals,
                                  min_goal_distance=min_goal_distance,
                                  max_goal_distance=max_goal_distance,
                                  num_transporters=num_transporters)

        self.num_goals_obtained = 0
        self.sparse_reward = sparse_rewards

        self.reset_task()

        self.simple_image_viewer = None
        self.last_im_obs = None

    def reset(self,
              reset_agent_start_pos=False,
              reset_board=False,
              reset_colors=False,
              reset_dynamics=False):
        self.agent.dead = False
        self.agent.agent_position = copy.deepcopy(
            self.agent.agent_position_init)
        self.agent.num_steps_until_energy_needed = copy.deepcopy(
            self.agent.energy_init)
        self.num_goals_obtained = 0
        self.game_grid.grid_np = copy.deepcopy(self.game_grid.game_grid_init)
        if reset_colors:
            self.tile_types.reset_colors()
        if reset_dynamics:
            self.agent.change_dynamics()
        if reset_board:
            self.reset_task()
        if reset_agent_start_pos:
            self.reset_agent_start_position()
        return self.get_obs()

    def reset_task(self):
        # reset the entire board and agent start position, generating a new MDP.
        self.game_grid.get_new_game_grid()
        self.reset_agent_start_position()

    def reset_agent_start_position(self):
        # keep the previous board but update the agents starting position.
        # keeps the previous MDP but samples x_0.
        new_start = self.game_grid.get_one_non_agent_square()
        self.agent.agent_position = new_start
        self.agent.agent_position_init = new_start

    def get_obs(self):
        if self.image_obs:
            return self.get_img_obs()
        else:
            return None

    def step(self, a, render=False):
        if self.agent.dead is False:
            proposed_step = self.agent.try_step(a)
            if self.game_grid.is_position_legal(proposed_step):
                self.agent.agent_position = proposed_step
            self.check_dead()
            self.check_at_goal()
            self.check_at_energy()
            self.check_at_transporter()

            #  handle ice squares: the agent keeps sliding in the move direction
            while True:
                if self.check_at_ice_square() is False:
                    break
                else:
                    #  don't take energy for going over ice.
                    self.agent.num_steps_until_energy_needed += 1
                    proposed_step_nu = self.agent.try_step(a)
                    if self.game_grid.is_position_legal(proposed_step_nu):
                        self.step(a)
                    else:
                        break

            if self.agent.num_steps_until_energy_needed < 1:
                self.agent.dead = True

            if render:
                self.render()
        return self.get_obs(), self.get_reward(), self.agent.dead, dict()

    def check_dead(self):
        agent_pos = self.agent.agent_position
        game_grid = self.game_grid.grid_np
        if game_grid[agent_pos[0], agent_pos[1]] == self.tile_types.death:
            self.agent.dead = True

    def check_at_goal(self):
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.goal:
            self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] = self.tile_types.normal
            self.num_goals_obtained += 1

    def check_at_energy(self):
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.energy:
            self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] = self.tile_types.normal
            self.agent.give_energy()

    def check_at_transporter(self):
        transport_sq = None
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.transporter:
            for tr in self.game_grid.transporters:
                if self.agent.agent_position[0] == tr[0][
                        0] and self.agent.agent_position[1] == tr[0][1]:
                    transport_sq = tr[1]
                elif self.agent.agent_position[0] == tr[1][
                        0] and self.agent.agent_position[1] == tr[1][1]:
                    transport_sq = tr[0]
            if transport_sq is not None:
                self.agent.agent_position = [transport_sq[0], transport_sq[1]]

    def check_at_ice_square(self):
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.ice:
            return True
        return False

    def render(self):
        if self.simple_image_viewer is None:
            from gym.envs.classic_control.rendering import SimpleImageViewer
            self.simple_image_viewer = SimpleImageViewer()
        im_obs = self.get_img_obs()
        self.simple_image_viewer.imshow(im_obs)
        time.sleep(0.075)

    def get_state_obs(self):
        grid_np = copy.deepcopy(self.game_grid.grid_np)
        agent_p = self.agent.agent_position
        grid_np[agent_p[0], agent_p[1]] = self.tile_types.agent
        grid_np = grid_np.astype(np.uint8)
        #agent_p = np.array(self.agent.agent_position)
        if self.one_hot_obs:
            n_values = np.max(grid_np) + 1
            grid_np = np.eye(n_values)[grid_np]
            #agent_p_temp = np.zeros((self.game_grid.grid_squares_per_row, self.game_grid.grid_squares_per_row, 1))
            #agent_p_temp[agent_p[0], agent_p[1], :] = 1

        if self.use_local_obs:
            neighbors = []
            x, y = self.agent.agent_position
            for _i, _j in [(-1, -1), (0, -1), (1, -1), (1, 0), (1, 1), (0, 1),
                           (-1, 1), (-1, 0)]:
                i, j = (_i + x, _j + y)
                if 0 <= i < self.game_grid.grid_squares_per_row and 0 <= j < self.game_grid.grid_squares_per_row:
                    neighbors.append([j, i])
                else:
                    neighbors.append(None)

            grid_np = np.array(neighbors)

        return grid_np.flatten()

    def get_img_obs(self):
        grid_np = copy.deepcopy(self.game_grid.grid_np)
        grid_np[self.agent.agent_position[0],
                self.agent.agent_position[1]] = self.tile_types.agent
        fake_img = np.zeros((self.game_grid.grid_squares_per_row,
                             self.game_grid.grid_squares_per_row, 3))
        for i in range(len(self.tile_types.all_tt)):
            is_grid_sq_color_i = grid_np == self.tile_types.all_tt[i]
            one_idxs = is_grid_sq_color_i.astype(int)
            one_idxs = np.tile(np.expand_dims(one_idxs, -1), 3)
            one_idxs = one_idxs * np.array(self.tile_types.colors[i].value)
            fake_img += one_idxs

        if self.use_local_obs:
            neighbors = []
            x, y = self.agent.agent_position
            valid_idxs = np.zeros_like(fake_img)
            valid_idxs[x, y] = 1.0
            for _i, _j in [(-1, -1), (0, -1), (1, -1), (1, 0), (1, 1), (0, 1),
                           (-1, 1), (-1, 0)]:
                i, j = (_i + x, _j + y)
                if 0 <= i < self.game_grid.grid_squares_per_row and 0 <= j < self.game_grid.grid_squares_per_row:
                    #neighbors.append([j, i])
                    valid_idxs[i, j] = 1.0
                else:
                    neighbors.append(None)
            fake_img *= valid_idxs

        res = cv2.resize(fake_img,
                         dsize=(256, 256),
                         interpolation=cv2.INTER_NEAREST)
        res = res.astype(np.uint8)
        return res

    def get_reward(self):
        if self.sparse_reward:
            return self.num_goals_obtained
        else:
            rew = 0
            for goal in self.game_grid.goal_squares:
                dist_1 = abs(goal[0] - self.agent.agent_position[0])
                dist_2 = abs(goal[1] - self.agent.agent_position[1])
                rew = rew + dist_1 + dist_2
            rew = -1.0 * rew
            rew = rew + 3.0 * self.num_goals_obtained
            return rew

    def close(self):
        if self.simple_image_viewer is not None:
            self.simple_image_viewer.close()
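A short random-rollout sketch for KrazyGridWorld; the assumption that action codes are integers in range(4) matching Agent.try_step is not confirmed by this excerpt.

# Usage sketch: one episode with random actions (action codes assumed to be 0-3).
env = KrazyGridWorld(screen_height=256, grid_squares_per_row=10, num_goals=3,
                     image_obs=True)
obs = env.reset(reset_board=True, reset_agent_start_pos=True)
done = False
while not done:
    obs, reward, done, info = env.step(np.random.randint(4))
env.close()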
Example No. 10
class BattleshipEnv(gym.Env):

    reward_range = (-float('inf'), float('inf'))
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 3
    }

    # hold_out > 0: run the env while excluding boards in held_out/<rules>.npy.
    # hold_out < 0: run the env cycling through the boards in held_out/<rules>.npy.
    # hold_out = 0: run the env on the whole distribution without holding out boards (not used in the paper).
    def __init__(self, rules='chain', n_board=7, hold_out=0):

        self.viewer = None
        self.seed()
        action_converter = []
        for i in range(n_board):
            for j in range(n_board):
                action_converter.append((i, j))
        self.action_converter = np.asarray(action_converter)
        self.n_board = n_board
        self.hold_out = hold_out
        self.rules = rules

        if hold_out == -1:
            self.heldout = np.load('held_out/' + rules + '.npy')
            self.maze_idx = 0
            self.maze = np.reshape(self.heldout[self.maze_idx], (7, 7))
            if self.rules in ['all', 'chain', 'tree', 'loop']:
                start = np.load('held_out/' + self.rules +
                                '_starts.npy')[self.maze_idx]
            else:
                hit_idx = np.where(self.maze == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))), size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])

        else:
            if hold_out > 0:
                heldout = np.load('held_out/' + rules + '.npy')
                self.heldout = set([tuple(x) for x in heldout])

            gen = generate_grid(self.rules, n=self.n_board)
            if len(gen) == 2:
                grid, start = gen
            else:
                grid = gen
                hit_idx = np.where(grid == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))), size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])

            if hold_out > 0:
                while tuple(grid.flatten()) in self.heldout:
                    gen = generate_grid(self.rules, n=self.n_board)
                    if len(gen) == 2:
                        grid, start = gen
                    else:
                        grid = gen
                        hit_idx = np.where(grid == 1)
                        choice = np.random.choice(list(range(len(hit_idx[0]))),
                                                  size=1)
                        start = (hit_idx[0][choice], hit_idx[1][choice])
            self.maze = grid

        self.board = np.ones(self.maze.shape) * -1
        self.current_position = start
        self.board[self.current_position[0], self.current_position[1]] = 1
        self.num_hits = 0
        self.self_hits = {}

        self.observation_space = Box(low=-1,
                                     high=1,
                                     shape=(n_board * n_board +
                                            n_board * n_board + 1, ),
                                     dtype=np.float64)
        self.action_space = Discrete(np.prod(self.maze.shape))
        self.nA = n_board * n_board

        self.prev_reward = 0
        self.prev_action = np.zeros((self.nA, ))

        self.valid_actions = [1 for _ in range(self.nA)]

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        prev_position = self.current_position
        self.current_position = self.action_converter[action]
        reward = 0

        if self.board[self.current_position[0],
                      self.current_position[1]] == -1:
            if self.maze[self.current_position[0],
                         self.current_position[1]] == 1:
                self.board[self.current_position[0],
                           self.current_position[1]] = 1
                self.num_hits += 1
                reward = 1
            else:
                self.board[self.current_position[0],
                           self.current_position[1]] = 0
                reward = -1
        else:
            reward = -2
            if (self.current_position[0],
                    self.current_position[1]) not in self.self_hits.keys():
                self.self_hits[(self.current_position[0],
                                self.current_position[1])] = 1
            else:
                self.self_hits[(self.current_position[0],
                                self.current_position[1])] += 1

        if self._is_goal():
            reward = +10
            done = True
            if self.hold_out == -1:
                self.maze_idx += 1
        else:
            done = False

        p_action = self.prev_action
        p_reward = self.prev_reward
        self.prev_action = np.zeros((self.nA, ))
        self.prev_action[action] = 1
        self.prev_reward = reward

        obs = self.board.flatten()

        obs_array = np.concatenate((obs, p_action, [p_reward]))

        return obs_array, reward, done, {}

    def _is_goal(self):
        return np.sum(self.board == 1) == np.sum(self.maze == 1)

    def get_image(self):
        img = np.empty((*self.board.shape, 3), dtype=np.uint8)
        for i in range(self.board.shape[0]):
            for j in range(self.board.shape[1]):
                if self.board[i, j] == -1:
                    img[i, j, :] = 255, 255, 255

                elif self.board[i, j] == 1:
                    img[i, j, :] = 255, 0, 0
                    if (i, j) in self.self_hits.keys():
                        if (255 - 10 * self.self_hits[(i, j)]) < 5:
                            img[i, j, :] = 0, 0, 0
                        else:
                            img[i, j, :] = (255 -
                                            10 * self.self_hits[(i, j)]), 0, 0
                else:
                    img[i, j, :] = 0, 0, 255
                    if (i, j) in self.self_hits.keys():
                        if (255 - 10 * self.self_hits[(i, j)]) < 5:
                            img[i, j, :] = 0, 0, 0
                        else:
                            img[i, j, :] = 0, 0, (255 -
                                                  10 * self.self_hits[(i, j)])

        return img

    def set_task(self, task):
        self.maze = task
        self.board = np.zeros(self.maze.shape)
        self.current_position = [
            np.random.choice(range(self.maze.shape[0])),
            np.random.choice(self.maze.shape[1])
        ]

        self.num_hits = 0
        self.self_hits = {}
        return self.board.flatten()

    def reset(self):
        if self.hold_out == -1:
            self.maze = np.reshape(
                self.heldout[self.maze_idx % len(self.heldout)], (7, 7))
            if self.rules in ['all', 'chain', 'tree', 'loop']:
                start = np.load('held_out/' + self.rules +
                                '_starts.npy')[self.maze_idx %
                                               len(self.heldout)]
            else:
                hit_idx = np.where(self.maze == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))), size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])
        else:
            gen = generate_grid(self.rules, n=self.n_board)
            if len(gen) == 2:
                grid, start = gen
            else:
                grid = gen
                hit_idx = np.where(grid == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))), size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])

            if self.hold_out > 0:
                while tuple(grid.flatten()) in self.heldout:
                    gen = generate_grid(self.rules, n=self.n_board)
                    if len(gen) == 2:
                        grid, start = gen
                    else:
                        grid = gen
                        hit_idx = np.where(grid == 1)
                        choice = np.random.choice(list(range(len(hit_idx[0]))),
                                                  size=1)
                        start = (hit_idx[0][choice], hit_idx[1][choice])
            self.maze = grid

        self.board = np.ones(self.maze.shape) * -1
        self.current_position = start
        self.board[self.current_position[0], self.current_position[1]] = 1

        self.num_hits = 0
        self.self_hits = {}
        obs = self.board.flatten()

        obs_array = np.concatenate((obs, self.prev_action, [self.prev_reward]))
        self.valid_actions = [1 for _ in range(self.nA)]
        return obs_array

    def render(self, mode='human', max_width=500):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        img = PILImage.fromarray(img).resize(
            [int(ratio * img_width),
             int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)

            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
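A usage sketch for BattleshipEnv; it assumes the module-level generate_grid helper and, for hold_out != 0, the held_out/<rules>.npy files referenced above are available.

# Usage sketch: fire at random cells until every ship cell has been hit.
env = BattleshipEnv(rules='chain', n_board=7, hold_out=0)
obs = env.reset()
done = False
while not done:
    action = np.random.randint(env.nA)  # uniformly random cell index
    obs, reward, done, info = env.step(action)
img = env.render(mode='rgb_array')  # upscaled board image as np.uint8
env.close()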
Example No. 11
class TetrisEnv(gym.Env, gym.utils.EzPickle):
    """An environment for playing Tetris in OpenAI Gym."""

    # meta-data about the environment for OpenAI Gym utilities (like Monitor)
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30,
    }

    def __init__(self, max_steps: int, random_state: int = None) -> None:
        """
        Initialize a new Tetris environment.

        Args:
            max_steps: the max number of steps per episode.
            random_state: the random seed to start the environment with

        Returns:
            None

        """
        gym.utils.EzPickle.__init__(self)
        self.max_steps = max_steps
        self.viewer = None
        self.step_number = 0
        # Setup the observation space as RGB game frames
        self.observation_space = gym.spaces.Box(low=0,
                                                high=255,
                                                shape=(SCREEN_HEIGHT,
                                                       SCREEN_WIDTH, 3),
                                                dtype=np.uint8)
        # Setup the action space, the game defines 12 legal actions
        self.action_space = gym.spaces.Discrete(12)
        # setup the game
        self.game = Tetris()
        self.seed(random_state)

    @property
    def screen(self) -> np.ndarray:
        """Return the screen of the game"""
        return self.game.screen

    def reset(self) -> np.ndarray:
        """Reset the emulator and return the initial state."""
        self.game.reset()
        # reset the step count
        self.step_number = 0
        # return the initial screen from the game
        return self.game.screen

    def step(self, action: int) -> tuple:
        """
        Take a step using the given action.

        Args:
            action: the discrete action to perform. will use the action in
                    `self.actions` indexed by this value

        Returns:
            a tuple of:
            -   the state as a result of the action
            -   the reward achieved by taking the action
            -   a flag denoting whether the episode has ended
            -   a dictionary of extra information

        """
        state, reward, done, info = self.game.step(action)
        self.step_number += 1
        # if this step has passed the max number, set the episode to done
        if self.step_number >= self.max_steps:
            done = True
        return state, reward, done, info

    def render(self, mode: str = 'human'):
        """
        Render the current screen using the given mode.

        Args:
            mode: the mode to render the screen using
                - 'human': render in a window using GTK
                - 'rgb_array': render in the back-end and return a matrix

        Returns:
            a matrix if mode is 'rgb_array', or the viewer's open flag if mode is 'human'

        """
        # if the mode is RGB, return the screen as a NumPy array
        if mode == 'rgb_array':
            return self.game.screen
        # if the mode is human, create a viewer and display the screen
        elif mode == 'human':
            from pyglet.window import Window
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
                self.viewer.window = Window(
                    width=SCREEN_WIDTH,
                    height=SCREEN_HEIGHT,
                    caption=self.spec.id,
                )
            self.viewer.imshow(self.game.screen)
            return self.viewer.isopen
        # otherwise the render mode is not supported, raise an error
        else:
            raise ValueError('unsupported render mode: {}'.format(repr(mode)))

    def close(self) -> None:
        """Close the emulator."""
        # delete the existing game if there is one
        if isinstance(self.game, Tetris):
            del self.game
        if self.viewer is not None:
            self.viewer.close()
            del self.viewer

    def seed(self, random_state: int = None) -> list:
        """
        Set the seed for this env's random number generator(s).

        Args:
            random_state: the seed to set the random generator to

        Returns:
            A list of seeds used in this env's random number generators

        """
        random.seed(random_state)
        self.curr_seed = random_state

        return [self.curr_seed]

    def get_keys_to_action(self) -> dict:
        """Return the dictionary of keyboard keys to actions."""
        # Map of in game directives to their associated keyboard value
        down = ord('s')
        left = ord('a')
        right = ord('d')
        rot_l = ord('q')
        rot_r = ord('e')
        # A mapping of pressed key combinations to discrete actions
        keys_to_action = {
            (): 0,
            (left, ): 1,
            (right, ): 2,
            (down, ): 3,
            (rot_l, ): 4,
            (rot_r, ): 5,
            tuple(sorted((
                left,
                down,
            ))): 6,
            tuple(sorted((
                right,
                down,
            ))): 7,
            tuple(sorted((
                left,
                rot_l,
            ))): 8,
            tuple(sorted((
                right,
                rot_l,
            ))): 9,
            tuple(sorted((
                left,
                rot_r,
            ))): 10,
            tuple(sorted((
                right,
                rot_r,
            ))): 11,
        }

        return keys_to_action
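A short usage sketch for TetrisEnv; it assumes the module-level Tetris game class and the SCREEN_HEIGHT/SCREEN_WIDTH constants used above are importable.

# Usage sketch: random play for one bounded episode, collecting RGB frames.
env = TetrisEnv(max_steps=500)
env.seed(42)
state = env.reset()
frames = [env.render(mode='rgb_array')]
done = False
while not done:
    state, reward, done, info = env.step(env.action_space.sample())
    frames.append(env.render(mode='rgb_array'))
env.close()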
Example No. 12
class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 default_reward,
                 action_space,
                 delay=30,
                 resize_scale=8):
        """Create an `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            default_reward: default reward if reward is None returned by the
                `pycolab` game.
            action_space: the action `Space` of the environment.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)

        self._max_iterations = max_iterations
        self._default_reward = default_reward

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None

        self._colors = self.make_colors()
        test_game = self.make_game()
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Create the observation space.
        observation_layers = list(set(layers))
        self._observation_order = sorted(observation_layers)
        channels = [3]
        channel_max = 255.
        channel_min = 0.
        self._game_shape = list(observations.board.shape) + channels
        self.observation_space = spaces.Box(
            low=np.full(self._game_shape, channel_min, np.float32),
            high=np.full(self._game_shape, channel_max, np.float32),
            dtype=np.float32)
        self.action_space = action_space

        self.current_game = None
        self._last_observations = None
        self._empty_board = None
        self._last_state = None
        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.resize_scale = resize_scale
        self.delay = delay

    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Functions that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """
        return {}

    def _paint_board(self, layers):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
                layers.
        """
        board_shape = self._last_observations.board.shape
        board = np.zeros(list(board_shape) + [3], np.uint32)
        board_mask = np.zeros(list(board_shape) + [3], bool)

        for key in self._render_order:
            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # Update the board with the new layer.
            board = np.where(np.logical_not(board_mask),
                             board_layer_mask * color, board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        self._last_observations = observations
        self._empty_board = np.zeros_like(self._last_observations.board)
        self._last_state = self._paint_board(observations.layers).astype(
            np.float32)
        self._last_reward = reward if reward is not None else \
            self._default_reward
        self._game_over = self.current_game.game_over

        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def reset(self):
        """Start a new episode."""
        self.current_game = self.make_game()
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None
        observations, reward, _ = self.current_game.its_showtime()
        self._update_for_game_step(observations, reward)
        return self._last_state

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended, call `reset` instead..")
            state = self._last_state
            reward = self._last_reward
            done = self._game_over
            return state, reward, done, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Check the current status of the game.
        state = self._last_state
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None
        return state, reward, done, info

    def render(self, mode='human'):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        img = self._empty_board
        if self._last_observations:
            img = self._last_observations.board
            layers = self._last_observations.layers
            if self._colors:
                img = self._paint_board(layers)
            else:
                assert img is not None, '`board` must not be `None`.'

        img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        img = img.astype(np.uint8)

        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
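Because make_game is abstract, the adapter above needs a concrete subclass; below is a minimal sketch built on pycolab's ascii_art helpers (the game art, PlayerSprite, and EmptyRoomEnv are illustrative names, not from the source).

# Sketch of a concrete subclass, assuming pycolab's ascii_art / prefab sprites.
from pycolab import ascii_art
from pycolab.prefab_parts import sprites as prefab_sprites

GAME_ART = ['#####',
            '#P  #',
            '#   #',
            '#####']

class PlayerSprite(prefab_sprites.MazeWalker):
    """Agent sprite that moves north/south/west/east and cannot cross '#'."""
    def __init__(self, corner, position, character):
        super(PlayerSprite, self).__init__(corner, position, character,
                                           impassable='#')

    def update(self, actions, board, layers, backdrop, things, the_plot):
        if actions == 0:
            self._north(board, the_plot)
        elif actions == 1:
            self._south(board, the_plot)
        elif actions == 2:
            self._west(board, the_plot)
        elif actions == 3:
            self._east(board, the_plot)

class EmptyRoomEnv(PyColabEnv):
    """Hypothetical concrete subclass wiring the game above into the adapter."""
    def __init__(self):
        super(EmptyRoomEnv, self).__init__(max_iterations=100,
                                           default_reward=0.0,
                                           action_space=spaces.Discrete(4))

    def make_game(self):
        return ascii_art.ascii_art_to_game(GAME_ART,
                                           what_lies_beneath=' ',
                                           sprites={'P': PlayerSprite})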
class TrafficEnv(gym.Env):
    def __init__(self, nlanes, ncars, images=True, sh=50):
        self.ncars = ncars
        self.nlanes = nlanes
        self.images = images

        self.l_lims = np.array((0.0, 0.0))  # f, b
        self.h_lims = np.array((1.0, 1.0))  # f, b
        self.vva = 0.005
        self.vlims = (0.01, 0.02)

        self.rsx = SWD / SHD
        self.sh = sh
        self.sw = int(self.sh * self.rsx)

        self.action_space = spaces.Tuple(
            (spaces.Discrete(3), spaces.Box(self.l_lims, self.h_lims)))

        if self.images:
            self.observation_space = spaces.Box(0.0,
                                                1.0,
                                                shape=(self.sw, self.sh))
        else:
            obs_low = np.array((0.0, 0.0, -self.vva) +
                               (-self.rsx, -2.0, 0.0, -0.2) * (self.ncars - 1))

            obs_high = np.array((self.rsx, 0.2, self.vva) +
                                (self.rsx, 2.0, self.rsx, 0.2) *
                                (self.ncars - 1))
            self.observation_space = spaces.Box(obs_low, obs_high)

        self.lanes = (np.arange(self.nlanes) + 0.5) * (self.sw /
                                                       self.sh) / self.nlanes
        self.hlanes = (np.arange(self.nlanes + 1) + 0.0) * (
            (self.sw - 1) / self.sh) / self.nlanes

        self.viewer = None
        self.cars = []
        self.is_final = False
        self.lanev = None
        self.reward_func = None

        self.np_random = None
        self.seed()

    def seed(self, seed=None):
        if seed is not None:
            np.random.seed(seed)
        return [seed]

    def reset(self):
        self.is_final = False
        car0y = 0.5
        self.lanev = np.random.rand(
            self.nlanes) * (self.vlims[1] - self.vlims[0]) + self.vlims[0]
        self.cars = [
            Car(self.lanes, int(self.nlanes / 2), car0y,
                self.lanev[int(self.nlanes / 2)])
        ]
        for ii in range(1, self.ncars):
            self.cars.append(self.get_car())
        return self.road_img(car0y - 0.5, self.sw,
                             self.sh)[0] if self.images else self.get_state()

    def get_car(self, ymin=0, ymax=1, lane=None):
        c_lane = np.random.randint(self.nlanes) if lane is None else lane
        c = Car(self.lanes, c_lane,
                np.random.rand() * (ymax - ymin) + ymin, self.lanev[c_lane])
        while self.car_overlaps(c):
            c_lane = np.random.randint(self.nlanes) if lane is None else lane
            c = Car(self.lanes, c_lane,
                    np.random.rand() * (ymax - ymin) + ymin,
                    self.lanev[c_lane])
        return c

    def car_overlaps(self, c, margin=1.2):
        for ii in range(len(self.cars)):
            if (self.cars[ii].lane == c.lane) and \
                    (np.abs(self.cars[ii].py - c.py) < (margin*(self.cars[ii].sy + c.sy))):
                return True
        return False

    def step(self, action):
        assert self.action_space[0].contains(
            action[0]), 'Action {} is invalid.'.format(action[0])
        assert self.action_space[1].contains(
            action[1]), 'Action {} is invalid.'.format(action[1])

        if not self.is_final:
            if action[0] != 1:
                self.cars[0].va = (action[0] - 1) * self.vva
            for it in range(200):
                car0y = self.cars[0].py
                for c in self.cars[1:]:
                    c.step(0.0, 1.0)
                self.cars[0].step(action[1][0] * 0.0005,
                                  (1.0 - action[1][1]) * 0.0001 + 0.99989)

            car0y = self.cars[0].py
            for ii in range(1, len(self.cars)):
                if (self.cars[ii].py - car0y) > 1:
                    self.cars[ii] = self.get_car(car0y - 0.5, car0y - 1.0)
                elif (self.cars[ii].py - car0y) < -1:
                    self.cars[ii] = self.get_car(car0y + 0.5, car0y + 1.0)

            rimg_i, cimgs_i = self.road_img(car0y - 0.5, self.sw, self.sh)
            self.is_final = (self.num_collisions(cimgs_i) > 0.0) or \
                            ((self.cars[0].px-self.cars[0].sx) <= 0.0) or \
                            ((self.cars[0].px+self.cars[0].sx) >= self.cars[0].rsx)

            reward = 0.0 if self.reward_func is None else self.reward_func(
                self.cars[0].px / self.rsx, self.cars[0].v,
                self.cars[0].va != 0.0, self.is_final)
        else:
            rimg_i, cimgs_i = self.road_img(self.cars[0].py - 0.5, self.sw,
                                            self.sh)
            self.is_final = True
            reward = 0.0
        return rimg_i if self.images else self.get_state(), reward, int(
            self.is_final), {}

    def render(self, mode='human'):
        if self.viewer is None:
            self.viewer = SimpleImageViewer()
        rimg_i = self.road_img(self.cars[0].py - 0.5, SWD, SHD)[0]
        img = np.transpose(
            np.stack([((1.0 - rimg_i) * 255).astype(np.uint8)] * 3, axis=2),
            (1, 0, 2))[::-1, :, :]
        self.viewer.imshow(img)

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None

    def car_imgs(self, y0, swi, shi):
        return np.transpose(
            np.array([c.render(y0, swi, shi) for c in self.cars]), (1, 2, 0))

    def road_img(self, y0, swi, shi):
        cimgs_i = self.car_imgs(y0, swi, shi)
        lane_line = 1.0 * (np.sin(
            ((np.arange(shi) / shi) + y0) * 20 * 2 * np.pi) > 0)
        road_img = np.sum(cimgs_i[:, :, 1:] * 0.25,
                          axis=2) + cimgs_i[:, :, 0] * 1.0
        for l in self.hlanes:
            road_img[int(l * shi), :] = np.maximum(road_img[int(l * shi), :],
                                                   lane_line * 0.75)
        return np.minimum(road_img, 1.0), cimgs_i

    def num_collisions(self, cimgs_i):
        return np.sum((np.sum(cimgs_i, axis=2) > 1.0).flatten())

    def get_state(self):
        p0y = self.cars[0].py
        p0x = self.cars[0].px
        v0y = self.cars[0].v
        return np.concatenate(
            (np.array([p0x, v0y, self.cars[0].va]), ) + tuple(
                np.array([c.px - p0x, c.py - p0y, c.px, c.v - v0y])
                for c in self.cars[1:]))
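A random-rollout sketch for TrafficEnv; the meaning of the discrete lane-change codes and the two continuous pedal values is assumed from the step logic above.

# Usage sketch: random (lane-change, pedals) actions until the car crashes or
# leaves the road (is_final).
env = TrafficEnv(nlanes=3, ncars=6, images=True, sh=50)
obs = env.reset()
done = False
while not done:
    lane_change = np.random.randint(3)  # 0/1/2 steering code (assumed meaning)
    pedals = np.random.uniform(0.0, 1.0, size=2).astype(np.float32)
    obs, reward, done, info = env.step((lane_change, pedals))
env.close()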
Example No. 14
class PursuersEvaders(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self,
                 file_name="mmap1.txt",
                 catch_level=2,
                 terminal_reward=10.0,
                 ontarget_reward=1.0,
                 move_reward=0.0,
                 bump_reward=-0.2):
        self.viewer = SimpleImageViewer()
        self.n = None
        self.m = None
        self.catch_level = catch_level
        self.walls = []
        self.init_evaders = []
        self.init_pursuers = []
        this_file_path = os.path.dirname(os.path.realpath(__file__))
        file_name = os.path.join(this_file_path, file_name)
        with open(file_name, "r") as f:
            for i, row in enumerate(f):
                row = row.rstrip('\r\n')
                if self.n is not None and len(row) != self.n:
                    raise ValueError(
                        "Map's rows are not of the same dimension...")
                self.n = len(row)
                for j, col in enumerate(row):
                    if col == "P":
                        self.init_pursuers.append(self.n * i + j)
                    elif col == "E":
                        self.init_evaders.append(self.n * i + j)
                    elif col == "1":
                        self.walls.append(self.n * i + j)
            self.m = i + 1
        if self.m < 3 or self.n < 3:
            raise ValueError("Map too small...")
        if len(self.init_pursuers) < self.catch_level:
            raise ValueError(
                "At least `catch_level` pursuers need to be specified...")
        if len(self.init_evaders) == 0:
            raise ValueError("At least one evader needs to be specified...")
        self.evaders = copy.copy(self.init_evaders)
        self.pursuers = copy.copy(self.init_pursuers)
        self.n_states = self.n * self.m
        self.n_actions = 5**len(self.init_pursuers)
        self.terminal_reward = terminal_reward
        self.ontarget_reward = ontarget_reward
        self.move_reward = move_reward
        self.bump_reward = bump_reward
        self.action_space = spaces.Box(0, 4, (len(self.init_pursuers), ))
        self.observation_space = spaces.Box(-1, 3, (3, 3))
        self.done = False

    def step(self, action):
        assert self.action_space.contains(action)
        if len(self.evaders) == 0:
            return self.build_observation(), 0.0, self.done, None
        else:
            new_state = self.take_action(action)
            reward = self.get_reward(new_state, action)
            self.pursuers = new_state
            self.take_evaders_action()
            return self.build_observation(), reward, self.done, None

    def reset(self):
        self.done = False
        self.evaders = copy.copy(self.init_evaders)
        self.pursuers = copy.copy(self.init_pursuers)
        return self.build_observation()

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        if mode not in ('human', 'rgb_array'):
            return
        # Build the RGB grid once so both render modes can use it.
        grid = np.multiply(np.ones((self.n_states, 3), dtype=np.uint8),
                           np.array([0, 255, 0], dtype=np.uint8))
        for e in self.evaders:
            grid[e] = np.array([255, 0, 0])
        for w in self.walls:
            grid[w] = np.array([0, 0, 0])
        for p in self.pursuers:
            grid[p] = np.array([0, 0, 255])
        grid = grid.reshape(self.m, self.n, 3)
        if mode == 'rgb_array':
            return grid
        self.viewer.imshow(grid)
        return self.viewer.isopen

    def take_action(self, action):
        new_state = []
        for a, p in zip(action, self.pursuers):
            row = p // self.n
            col = p % self.n
            if a == DOWN and (row + 1) * self.n + col not in self.walls:
                row = min(row + 1, self.m - 1)
            elif a == UP and (row - 1) * self.n + col not in self.walls:
                row = max(0, row - 1)
            elif a == RIGHT and row * self.n + col + 1 not in self.walls:
                col = min(col + 1, self.n - 1)
            elif a == LEFT and row * self.n + col - 1 not in self.walls:
                col = max(0, col - 1)
            new_state.append(row * self.n + col)
        return new_state

    def take_evaders_action(self):
        new_goals = []
        for e in self.evaders:
            row = e // self.n
            col = e % self.n
            a = np.random.randint(0, 5)  # random evader action
            if a == DOWN and (row + 1) * self.n + col not in self.walls:
                row = min(row + 1, self.m - 1)
            elif a == UP and (row - 1) * self.n + col not in self.walls:
                row = max(0, row - 1)
            elif a == RIGHT and row * self.n + col + 1 not in self.walls:
                col = min(col + 1, self.n - 1)
            elif a == LEFT and row * self.n + col - 1 not in self.walls:
                col = max(0, col - 1)
            new_goals.append(row * self.n + col)
        self.evaders = new_goals

    def get_reward(self, new_state, action):
        reward = 0.0
        for i, p in enumerate(new_state):
            n = 1
            for x in new_state[i + 1:]:
                if x == p:
                    n += 1
            if n >= self.catch_level and p in self.evaders:
                reward += self.terminal_reward
                self.evaders.remove(p)
                if len(self.evaders) == 0:
                    self.done = True
            elif p in self.evaders:
                reward += self.ontarget_reward
            elif p == self.pursuers[i] and action[i] != NOOP:
                reward += self.bump_reward
            else:
                reward += self.move_reward
        return reward

    def build_observation(self):
        observations = []
        for p in self.pursuers:
            row = p // self.n
            col = p % self.n
            o = np.zeros((3, 3), dtype=np.int8)
            for i in range(-1, 2):
                for j in range(-1, 2):
                    if row + i < 0 or row + i >= self.m or col + j < 0 or col + j >= self.n:
                        o[i + 1][j + 1] = -1
                    else:
                        q = (row + i) * self.n + col + j
                        if q in self.walls:
                            o[i + 1][j + 1] = -1
                        elif q in self.evaders:
                            if q in self.pursuers:
                                o[i + 1][j + 1] = 3
                            else:
                                o[i + 1][j + 1] = 1
                        elif q in self.pursuers:
                            o[i + 1][j + 1] = 2
            o = o.tolist()
            for i, e in enumerate(o):
                o[i] = tuple(e)
            observations.append(tuple(o))
        return tuple(observations)
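A bounded random-rollout sketch for PursuersEvaders; mapping the sampled action codes 0-4 onto the module-level UP/DOWN/LEFT/RIGHT/NOOP constants is an assumption.

# Usage sketch: random joint pursuer actions for at most 200 steps.
env = PursuersEvaders(file_name="mmap1.txt", catch_level=2)
obs = env.reset()
for _ in range(200):
    action = np.random.randint(0, 5, size=len(env.pursuers)).astype(np.float32)
    obs, reward, done, info = env.step(action)
    env.render(mode='human')
    if done:
        break
env.render(close=True)  # tear down the viewer created in __init__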
Example No. 15
class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 default_reward,
                 action_space,
                 act_null_value=4,
                 delay=30,
                 resize_scale=8,
                 crop_window=[5, 5],
                 render_mode='uncropped'):
        """Create an `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            default_reward: default reward if reward is None returned by the
                `pycolab` game.
            action_space: the action `Space` of the environment.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
            crop_window: dimensions of observation cropping.
            render_mode: render board `cropped` or `uncropped`.
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)

        self._max_iterations = max_iterations
        self._default_reward = default_reward

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None

        self._colors = self.make_colors()
        test_game = self.make_game()
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Create the observation space.
        observation_layers = list(set(layers))
        self._observation_order = sorted(observation_layers)
        self.observation_space = spaces.Box(
            0., 1., [len(self.state_layer_chars)] + crop_window)  # don't count empty space layer
        self.action_space = action_space
        self.act_null_value = act_null_value

        self.current_game = None
        self._croppers = []
        self._state = None
        self._last_observations = None
        self._last_uncropped_observations = None
        self._empty_board = None
        self._empty_uncropped_board = None
        self._last_painted = None
        self._last_uncropped_painted = None
        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.resize_scale = resize_scale
        self.render_mode = render_mode
        self.delay = delay

        # Metrics
        self.visitation_frequency = {char:0 for char in self.objects}
        self.first_visit_time = {char:500 for char in self.objects}
        self.heat_map = None


    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Functions that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """

        return {'P' : (255., 255., 255.),
                'a' : (175., 255., 15.),
                'b' : (21., 0., 255.),
                'c' : (0., 250., 71.),
                'd' : (250., 0., 129.),
                'e' : (255., 0., 0.),
                '#' : (61., 61., 61.),
                '@' : (255., 255., 0.),
                ' ' : (0., 0., 0.)}

    def _paint_board(self, layers):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
                layers.
        """
        if self.render_mode == 'uncropped':
            board_shape = self._last_uncropped_observations.board.shape
        elif self.render_mode == 'cropped':
            board_shape = self._last_observations.board.shape
        board = np.zeros(list(board_shape) + [3], np.uint32)
        board_mask = np.zeros(list(board_shape) + [3], bool)

        for key in self._render_order:
            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # Update the board with the new layer.
            board = np.where(
                np.logical_not(board_mask),
                board_layer_mask * color,
                board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        # disentangled one hot state

        # update heatmap
        r, c = self.current_game.__dict__['_sprites_and_drapes']['P'].position
        self.heat_map[r, c] += 1

        # update state
        self._state = []
        for char in self.state_layer_chars:
            if char != ' ':
                mask = observations.layers[char].astype(float)
                if char in self.objects and 1. in mask:
                    self.visitation_frequency[char] += 1
                self._state.append(mask)
        self._state = np.array(self._state)

        # rendering purposes (RGB)
        self._last_observations = observations
        if self.render_mode == 'cropped':
            self._empty_board = np.zeros_like(self._last_observations.board)
            self._last_painted = self._paint_board(observations.layers).astype(np.float32)

        self._last_reward = reward if reward is not None else \
            self._default_reward

        self._game_over = self.current_game.game_over

        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def reset(self):
        """Start a new episode."""
        self.current_game = self.make_game()
        for cropper in self._croppers:
            cropper.set_engine(self.current_game)
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None
        observations, reward, _ = self.current_game.its_showtime()
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(self._last_uncropped_observations.board)
        self._last_uncropped_painted = self._paint_board(observations.layers).astype(np.float32)
        if len(self._croppers) > 0:
            observations = [cropper.crop(observations) for cropper in self._croppers][0]

        self.heat_map = np.zeros((self.current_game.rows, self.current_game.cols))
        self._update_for_game_step(observations, reward)
        self.visitation_frequency = {char:0 for char in self.objects} # reset trackers
        return self._state

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended, call `reset` instead..")
            state = self._last_painted
            reward = self._last_reward
            done = self._game_over
            return state, reward, done, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(self._last_uncropped_observations.board)
        self._last_uncropped_painted = self._paint_board(observations.layers).astype(np.float32)

        # Crop and update
        if len(self._croppers) > 0:
            observations = [cropper.crop(observations) for cropper in self._croppers][0]
        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Add custom metrics
        info['visitation_frequency'] = self.visitation_frequency
        info['first_time_visit'] = self.first_visit_time
        info['heat_map'] = self.heat_map

        # Check the current status of the game.
        state = self._last_painted # for rendering
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None

        return self._state, reward, done, info

    def render(self, mode='rgb_array', close=False):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        if self.render_mode == 'cropped':
            img = self._empty_board
            if self._last_observations:
                img = self._last_observations.board
                layers = self._last_observations.layers
                if self._colors:
                    img = self._paint_board(layers)
                else:
                    assert img is not None, '`board` must not be `None`.'
        elif self.render_mode == 'uncropped':
            img = self._empty_uncropped_board
            if self._last_uncropped_observations:
                img = self._last_uncropped_observations.board
                layers = self._last_uncropped_observations.layers
                if self._colors:
                    img = self._paint_board(layers)
                else:
                    assert img is not None, '`board` must not be `None`.'
        else:
            raise ValueError('Unsupported render_mode: {}'.format(self.render_mode))

        img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        img = img.astype(np.uint8)

        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
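
A minimal rollout sketch follows; it is not part of the original example and only assumes the standard Gym interface (reset/step/close plus an action_space) that the wrapper above is meant to provide.

def random_rollout(env, max_steps=100):
    """Run one episode with uniformly random actions and return the episode return.

    `env` is any Gym-style environment such as the pycolab wrapper above; this
    helper is illustrative and not part of the original listing.
    """
    total_reward = 0.0
    env.reset()
    for _ in range(max_steps):
        action = env.action_space.sample()  # random policy, for illustration only
        _, reward, done, _ = env.step(action)
        if reward is not None:  # pycolab may report None rewards on some frames
            total_reward += reward
        if done:
            break
    env.close()
    return total_reward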
Exemplo n.º 16
0
class GridWorld(gym.Env):
    """Custom gridworld Environment that follows OpenAI Gym interface.

    Has height x width size and n_buttons of buttons
    Agent has to press them in ascending order
    """
    metadata = {'render.modes': ['human']}
    action2name = ['UP', 'DOWN', 'LEFT', 'RIGHT', 'PRESS']
    name2action = {k: v for v, k in enumerate(action2name)}
    action2delta = np.array([[-1, 0], [1, 0], [0, -1], [0, 1], [0, 0]], dtype=int)

    def __init__(self, height=5, width=5, n_buttons=3, button_pos=None, pixels_per_tile=10, seed=None,
                 obs_dtype='bool'):
        """
        :param height: height of the world (in tiles)
        :param width: width of the world (in tiles)
        :param n_buttons: number of buttons
        :param button_pos: (optional) list of (2,) numpy arrays - positions of the buttons
        :param pixels_per_tile: height/width of a tile for rendering
        :param seed: if specified, sets this seed to numpy random
        :param obs_dtype: 'bool' or 'int' observation format. 'bool' for agent + one-hot encoding of buttons,
            'int' for one integer for each tile
        """
        self.action_space = spaces.Discrete(5)
        if obs_dtype == 'bool':
            self.observation_space = spaces.Box(low=0, high=1, shape=(2 * n_buttons + 1, height, width), dtype=int)
        else:
            self.observation_space = spaces.Box(low=0, high=2 * (2 * n_buttons + 1),
                                                shape=(height, width), dtype=int)
        self.height = height
        self.width = width
        self.n_buttons = n_buttons
        self.button_pos = button_pos
        if seed is not None:
            np.random.seed(seed)
        if self.button_pos is None:
            self.button_pos = []
            idx = np.random.choice(height * width, n_buttons, replace=False)
            for index in idx:
                self.button_pos.append(np.array([index // width, index % width], dtype=int))
            self.button_idx = tuple(idx)
        else:
            self.button_idx = tuple(a * width + b for (a, b) in button_pos)
        if obs_dtype not in ['bool', 'int']:
            raise ValueError('obs_dtype should be "bool" or "int"')
        self.obs_dtype = obs_dtype

        self.next_button = None
        self.pos = None
        self.viewer = SimpleImageViewer()
        self.pixels_per_tile = pixels_per_tile

    def next_pos(self, pos, action):
        """
        Returns the next position of the agent
        :param pos: current position
        :param action: action number
        :return: next position
        """
        delta = self.action2delta[action]
        res = pos + delta
        res[0] = np.clip(res[0], 0, self.height - 1)
        res[1] = np.clip(res[1], 0, self.width - 1)
        return res

    def get_observation(self):
        """
        Returns an observation of the current environment state

        :return: if obs_dtype == 'bool': numpy array of shape (2 * n_buttons + 1, height, width) with values of 0 and 1
            if obs_dtype == 'int': numpy array of shape (height, width) with values in range(2 * (2 * n_buttons + 1))
        """
        if self.obs_dtype == 'bool':
            obs = np.zeros((2 * self.n_buttons + 1, self.height, self.width), dtype=int)
            h, w = self.pos
            # Agent position channel
            obs[0, h, w] = 1
            for ind, b_pos in enumerate(self.button_pos):
                h, w = b_pos
                if ind < self.next_button:
                    # Pressed
                    obs[2 * ind + 1, h, w] = 1
                else:
                    # Unpressed
                    obs[2 * ind + 2, h, w] = 1
            return obs
        if self.obs_dtype == 'int':
            obs = np.zeros((self.height, self.width), dtype=int)
            h, w = self.pos
            obs[h, w] = 2 * self.n_buttons + 1
            for ind, b_pos in enumerate(self.button_pos):
                h, w = b_pos
                if ind < self.next_button:
                    # Pressed
                    obs[h, w] += 2 * ind + 1
                else:
                    # Unpressed
                    obs[h, w] += 2 * ind + 2
            return obs
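
    # Worked example of the 'int' encoding (illustrative values, not from the
    # original code): with n_buttons=2, the agent at (0, 0), button 0 at (1, 1)
    # already pressed and button 1 at (2, 2) still unpressed, get_observation()
    # returns a (height, width) array where
    #   obs[0, 0] = 2 * n_buttons + 1 = 5   (agent)
    #   obs[1, 1] = 2 * 0 + 1 = 1           (button 0, pressed)
    #   obs[2, 2] = 2 * 1 + 2 = 4           (button 1, unpressed)
    # and every other cell is 0.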

    def step(self, action):
        """
        Step function of the environment

        :param action: int from range(5)
        :return: observation - np.array (depends on obs_dtype)
            reward - float, 1. if the last button is pressed, else 0.
            done - bool, True if the last button is pressed
            info - information dict
        """
        if self.action2name[action] == 'PRESS':
            if np.all(self.pos == self.button_pos[self.next_button]):
                self.next_button += 1
        self.pos = self.next_pos(self.pos, action)
        obs = self.get_observation()
        done = (self.next_button == self.n_buttons)
        reward = float(done)
        info = self.get_info()
        return obs, reward, done, info

    def reset(self):
        """
        Resets the environment to the initial state

        :return: observation of the initial state
        """
        self.next_button = 0
        self.pos = np.array([0, 0], dtype=int)
        return self.get_observation()

    def render(self, mode='human', close=False):
        """Used for rendering of the environment"""
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        if mode == 'human':
            grid = grid_from_state_data(self.height, self.width, self.n_buttons, self.button_pos, self.pixels_per_tile,
                                        self.pos, self.next_button)
            self.viewer.imshow(grid)
            return self.viewer.isopen
        elif mode == 'get_grid':
            return grid_from_state_data(self.height, self.width, self.n_buttons, self.button_pos, self.pixels_per_tile,
                                        self.pos, self.next_button)
        else:
            return

    def close(self):
        """Closes the renderer if it is open."""
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None

    def get_expert_action(self, eps=0.05):
        """
        Returns an action from (1 - eps) * optimal policy + eps * random policy

        :param eps: probability of a random action
        :return: action number from range(5)
        """
        if eps and np.random.rand() < eps:
            return int(np.random.randint(low=0, high=5))
        target = self.button_pos[self.next_button]
        if np.all(self.pos == target):
            return self.name2action['PRESS']
        vert = target[0] - self.pos[0]
        hor = target[1] - self.pos[1]
        if np.random.rand() < abs(vert) / (abs(vert) + abs(hor)):
            # go vertical
            if vert > 0:
                action = self.name2action['DOWN']
            else:
                action = self.name2action['UP']
        else:
            # go horizontal
            if hor > 0:
                action = self.name2action['RIGHT']
            else:
                action = self.name2action['LEFT']
        return action

    def get_info(self):
        """
        Information dict about the environment

        :return: dict with the 'state_tuple' key: encoding of the environment state (not used)
        """
        info = {'state_tuple': self.button_idx + (self.next_button,) + tuple(self.pos)}
        return info

    def to_random_state(self, seed=239):
        """
        Moves the environment to a random state (preserves button positions)

        :param seed: if specified, sets this seed to numpy random
        :return: observation of the new state
        """
        if seed is not None:
            np.random.seed(seed)
        self.pos[0] = np.random.randint(0, self.height)
        self.pos[1] = np.random.randint(0, self.width)
        self.next_button = np.random.randint(0, self.n_buttons)
        return self.get_observation()

    def get_all_next_states_with_data(self):
        """
        Returns all of the possible next states and their 'state data'

        :return: states - observations of all states, accessible from the current one
            data - 'state data' of those states
        """
        backup = self.pos.copy(), self.next_button
        states = []
        data = []
        for next_button in range(self.next_button, self.n_buttons):
            for pos_h in range(self.height):
                for pos_w in range(self.width):
                    self.pos[0] = pos_h
                    self.pos[1] = pos_w
                    self.next_button = next_button
                    states.append(self.get_observation())
                    data.append(self.get_state_data())
        self.next_button = self.n_buttons
        self.pos[:] = self.button_pos[-1]
        states.append(self.get_observation())
        data.append(self.get_state_data())

        self.pos, self.next_button = backup
        return states, data

    def get_state_data(self):
        """
        Returns the 'state data' tuple

        :return: height, width, number of buttons, button positions, pixels per tile, agent position, and next button
            number - copied from the environment
        """
        return (
            copy(self.height), copy(self.width), copy(self.n_buttons), copy(self.button_pos),
            copy(self.pixels_per_tile), copy(self.pos), copy(self.next_button))

    def load_state_data(self, height, width, n_buttons, button_pos, pixels_per_tile, pos, next_button):
        """Loads environment data from the 'state data' format into the environment"""
        self.height = height
        self.width = width
        self.n_buttons = n_buttons
        self.button_pos = button_pos
        self.pixels_per_tile = pixels_per_tile
        self.pos = pos
        self.next_button = next_button
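
A short usage sketch (not part of the original listing): it rolls out the eps-greedy expert policy defined above on a small GridWorld instance; the sizes and seed below are illustrative.

if __name__ == '__main__':
    env = GridWorld(height=5, width=5, n_buttons=3, seed=0)
    obs = env.reset()
    done, steps, episode_return = False, 0, 0.0
    while not done and steps < 200:
        action = env.get_expert_action(eps=0.05)  # mostly optimal, occasionally random
        obs, reward, done, info = env.step(action)
        episode_return += reward
        steps += 1
    env.close()
    print('solved:', done, 'return:', episode_return, 'steps:', steps)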