from abc import ABC, abstractmethod

import gym
import numpy as np
from gym.utils import seeding
from PIL import Image


class BaseEnv(gym.Env, ABC):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 3
    }
    reward_range = (-float('inf'), float('inf'))

    def __init__(self):
        self.viewer = None
        self.seed()

    @abstractmethod
    def step(self, action):
        pass

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def get_image(self):
        pass

    def render(self, mode='rgb_array', max_width=20):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        img = Image.fromarray(img).resize(
            [int(ratio * img_width), int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
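
# A minimal concrete subclass, as a sketch of the contract above. `TinyEnv`,
# its 3x3 board, and the action semantics are illustrative assumptions, not
# part of the original package.
class TinyEnv(BaseEnv):
    def __init__(self):
        super().__init__()
        self.pos = 0

    def step(self, action):
        # Walk right along the 9 cells; the episode ends at the last cell.
        self.pos = min(self.pos + 1, 8) if action == 1 else self.pos
        done = self.pos == 8
        return self.pos, float(done), done, {}

    def reset(self):
        self.pos = 0
        return self.pos

    def get_image(self):
        # Paint the agent's cell red on a 3x3 RGB board.
        img = np.zeros((3, 3, 3), dtype=np.uint8)
        img[self.pos // 3, self.pos % 3] = (255, 0, 0)
        return img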
from abc import ABC

from gym import spaces, utils
from gym.envs.atari import AtariEnv
from gym.utils import seeding

# `np_random`, `MockALE`, and `ACTION_MEANING` are provided by the
# surrounding toybox package.


class ToyboxBaseEnv(AtariEnv, ABC):

    metadata = {'render.modes': ['human']}

    def __init__(self,
                 toybox,
                 game,
                 frameskip=(2, 5),
                 repeat_action_probability=0.,
                 grayscale=True,
                 alpha=False,
                 actions=None):
        assert toybox.rstate
        self.toybox = toybox

        # This is a workaround for issues with Gym wrappers
        # resetting state prematurely.
        self.cached_state = None

        self.score = self.toybox.get_score()
        self.viewer = None

        # Required for compatibility with OpenAI Gym's Atari wrappers.
        self.np_random = np_random
        self.ale = MockALE(toybox)

        utils.EzPickle.__init__(self, game, 'human', frameskip,
                                repeat_action_probability)

        # By default, we don't need actions passed in:
        if actions is None:
            actions = toybox.get_legal_action_set()
        assert actions is not None
        self._action_set = actions
        self._obs_type = 'image'
        self._rgba = 1 if grayscale else 4 if alpha else 3
        self._pixel_high = 255
        self._height = self.toybox.get_height()
        self._width = self.toybox.get_width()
        self._dim = (self._height, self._width, self._rgba)  # * len(self.toybox.get_state())

        self.reward_range = (0, float('inf'))
        self.action_space = spaces.Discrete(len(self._action_set))
        self.observation_space = spaces.Box(low=0,
                                            high=self._pixel_high,
                                            shape=self._dim,
                                            dtype='uint8')

    def seed(self, seed=None):
        """This is essentially the implementation from AtariEnv in openai/gym."""
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed. This gets passed as a uint, but gets
        # checked as an int elsewhere, so we need to keep it below 2**31.
        # Toybox takes a uint seed, but we're copying the ALE seed for the
        # reasons above. It's unclear who checks, so we're being safe here.
        seed2 = seeding.hash_seed(seed1 + 1) % 2**31
        self.toybox.set_seed(seed2)
        # Start a new game to ensure that the seed gets used!
        self.toybox.new_game()
        return [seed1, seed2]

    # This is required to "trick" baselines into treating us as a regular
    # Atari game. Implementation copied from baselines.
    def get_action_meanings(self):
        # return [ACTION_MEANING[i] for i in self._action_set]
        return list(ACTION_MEANING.values())

    # From OpenAI Gym Baselines
    # https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
    def _get_obs(self):
        return self.toybox.get_state()

    def step(self, action_index):
        obs = None
        reward = None
        done = False
        info = {}

        # Sometimes the action_index is a numpy integer...
        # print('Action index and type', action_index, type(action_index))
        assert action_index < len(self._action_set)
        assert type(self._action_set) == list

        self.toybox.apply_ale_action(self._action_set[action_index])
        if self.ale.game_over():
            print('GAME OVER')
            info['cached_state'] = self.toybox.to_state_json()

        obs = self._get_obs()

        # Compute the reward from the current score and reset the current score.
        score = self.toybox.get_score()
        reward = max(score - self.score, 0)
        self.score = score

        # Check whether the episode is done; use "ale" semantics here.
        done = self.ale.game_over()

        # Send back diagnostic information.
        info['lives'] = self.toybox.get_lives()
        # info['frame'] = frame
        info['score'] = 0 if done else self.score

        return obs, reward, done, info

    def reset(self):
        self.cached_state = self.toybox.to_state_json()
        self.toybox.new_game()
        self.score = self.toybox.get_score()
        obs = self._get_obs()
        return obs

    def render(self, mode='human', close=False):
        if mode == 'human':
            # The following is copied from gym's AtariEnv.
            if self.viewer is None:
                from gym.envs.classic_control.rendering import SimpleImageViewer
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(self.toybox.get_rgb_frame())
            return self.viewer.isopen
        elif mode == 'rgb_array':
            return self.toybox.get_rgb_frame()

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
        del self.toybox
        self.toybox = None
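
# Usage sketch (hedged): `ToyboxBaseEnv` is abstract, so a concrete
# game-specific subclass is required, and the `Toybox` constructor below is
# an assumption about the ctoybox API rather than a confirmed signature.
#
#   tb = Toybox('breakout')
#   env = BreakoutEnv(tb, 'breakout')  # hypothetical concrete subclass
#   obs = env.reset()
#   obs, reward, done, info = env.step(env.action_space.sample())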
from abc import ABC, abstractmethod

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding
from PIL import Image


class BaseEnv(gym.Env, ABC):
    """Base class for all mazelab environments.

    The subclass should implement at least the following:

    - :meth:`step`
    - :meth:`reset`
    - :meth:`get_image`
    """

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 3
    }

    def __init__(self, maze, motion):
        self.maze = maze
        self.motion = motion

        self.observation_space = spaces.Box(low=-np.inf,
                                            high=np.inf,
                                            shape=self.maze.size,
                                            dtype=np.float32)
        self.action_space = spaces.Discrete(self.motion.size)

        self.viewer = None
        self.seed()

    @abstractmethod
    def step(self, action):
        pass

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def get_image(self):
        pass

    def render(self, mode='human', max_width=500):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        img = Image.fromarray(img).resize(
            [int(ratio * img_width), int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
import gc
import gzip
import json
import os

import gym
import numpy as np
import retro
from gym.utils import seeding

# `GameData` and `gym_version` are provided by the surrounding retro package.


class RetroEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 60.0
    }

    def compute_step(self, image):
        reward = self.data.current_reward()
        done = self.data.is_done()
        return reward, done, self.data.lookup_all()

    def record_movie(self, path):
        self.movie = retro.Movie(path, True)
        self.movie.configure(self.gamename, self.em)
        if self.initial_state:
            self.movie.set_state(self.initial_state)

    def stop_record(self):
        self.movie_path = None
        self.movie_id = 0
        if self.movie:
            self.movie.close()
            self.movie = None

    def auto_record(self, path=None):
        if not path:
            path = os.getcwd()
        self.movie_path = path

    def __init__(self,
                 game,
                 state=retro.STATE_DEFAULT,
                 scenario=None,
                 info=None,
                 use_restricted_actions=retro.ACTIONS_FILTERED,
                 record=False):
        if not hasattr(self, 'spec'):
            self.spec = None
        self.img = None
        self.viewer = None
        self.gamename = game
        self.statename = state

        game_path = retro.get_game_path(game)
        rom_path = retro.get_romfile_path(game)
        metadata_path = os.path.join(game_path, 'metadata.json')

        if state == retro.STATE_NONE:
            self.initial_state = None
        elif state == retro.STATE_DEFAULT:
            self.initial_state = None
            try:
                with open(metadata_path) as f:
                    metadata = json.load(f)
                if 'default_state' in metadata:
                    with gzip.open(
                            os.path.join(game_path,
                                         metadata['default_state']) +
                            '.state', 'rb') as fh:
                        self.initial_state = fh.read()
            except (IOError, json.JSONDecodeError):
                pass
        else:
            if not state.endswith('.state'):
                state += '.state'
            with gzip.open(os.path.join(game_path, state), 'rb') as fh:
                self.initial_state = fh.read()

        self.data = GameData()

        if info is None:
            info = 'data'
        if info.endswith('.json'):
            # Assume it's a path.
            info_path = info
        else:
            info_path = os.path.join(game_path, info + '.json')

        if scenario is None:
            scenario = 'scenario'
        if scenario.endswith('.json'):
            # Assume it's a path.
            scenario_path = scenario
        else:
            scenario_path = os.path.join(game_path, scenario + '.json')

        system = retro.get_romfile_system(rom_path)

        # We can't have more than one emulator per process. Before creating an
        # emulator, ensure that unused ones are garbage-collected.
        gc.collect()
        self.em = retro.RetroEmulator(rom_path)
        self.em.configure_data(self.data)
        self.em.step()

        img = self.em.get_screen()

        core = retro.get_system_info(system)
        self.BUTTONS = core['buttons']
        self.NUM_BUTTONS = len(self.BUTTONS)
        self.BUTTON_COMBOS = self.data.valid_actions()

        try:
            assert self.data.load(info_path, scenario_path), \
                'Failed to load info (%s) or scenario (%s)' % (info_path,
                                                               scenario_path)
        except Exception:
            del self.em
            raise

        if use_restricted_actions == retro.ACTIONS_DISCRETE:
            combos = 1
            for combo in self.BUTTON_COMBOS:
                combos *= len(combo)
            self.action_space = gym.spaces.Discrete(combos)
        elif use_restricted_actions == retro.ACTIONS_MULTI_DISCRETE:
            self.action_space = gym.spaces.MultiDiscrete([
                len(combos) if gym_version >= (0, 9, 6) else
                (0, len(combos) - 1) for combos in self.BUTTON_COMBOS
            ])
        else:
            self.action_space = gym.spaces.MultiBinary(self.NUM_BUTTONS)

        kwargs = {}
        if gym_version >= (0, 9, 6):
            kwargs['dtype'] = np.uint8
        self.observation_space = gym.spaces.Box(low=0,
                                                high=255,
                                                shape=img.shape,
                                                **kwargs)

        self.use_restricted_actions = use_restricted_actions
        self.movie = None
        self.movie_id = 0
        self.movie_path = None
        if record is True:
            self.auto_record()
        elif record is not False:
            self.auto_record(record)
        self.seed()
        if gym_version < (0, 9, 6):
            self._seed = self.seed
            self._step = self.step
            self._reset = self.reset
            self._render = self.render
            self._close = self.close

    def step(self, a):
        if self.img is None:
            raise RuntimeError('Please call env.reset() before env.step()')
        action = 0
        if self.use_restricted_actions == retro.ACTIONS_DISCRETE:
            for combo in self.BUTTON_COMBOS:
                current = a % len(combo)
                a //= len(combo)
                action |= combo[current]
        elif self.use_restricted_actions == retro.ACTIONS_MULTI_DISCRETE:
            for i in range(len(a)):
                buttons = self.BUTTON_COMBOS[i]
                action |= buttons[a[i]]
        else:
            for i in range(len(a)):
                action |= int(a[i]) << i
            if self.use_restricted_actions == retro.ACTIONS_FILTERED:
                action = self.data.filter_action(action)
        a = np.zeros([16], np.uint8)
        for i in range(16):
            a[i] = (action >> i) & 1
            if self.movie:
                self.movie.set_key(i, a[i])
        if self.movie:
            self.movie.step()
        self.em.set_button_mask(a)
        self.em.step()
        self.img = ob = self.em.get_screen()
        self.data.update_ram()
        rew, done, info = self.compute_step(ob)
        return ob, float(rew), bool(done), dict(info)

    def reset(self):
        if self.initial_state:
            self.em.set_state(self.initial_state)
        self.em.set_button_mask(np.zeros([16], np.uint8))
        self.em.step()
        if self.movie_path is not None:
            self.record_movie(
                os.path.join(
                    self.movie_path, '%s-%s-%04d.bk2' %
                    (self.gamename, self.statename, self.movie_id)))
            self.movie_id += 1
        if self.movie:
            self.movie.step()
        self.img = ob = self.em.get_screen()
        self.data.reset()
        self.data.update_ram()
        return ob

    def seed(self, seed=None):
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed. This gets passed as a uint, but gets
        # checked as an int elsewhere, so we need to keep it below 2**31.
        seed2 = seeding.hash_seed(seed1 + 1) % 2**31
        return [seed1, seed2]

    def render(self, mode='human', close=False):
        if close:
            if self.viewer:
                self.viewer.close()
            return
        if mode == "rgb_array":
            return self.em.get_screen() if self.img is None else self.img
        elif mode == "human":
            if self.viewer is None:
                from gym.envs.classic_control.rendering import SimpleImageViewer
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(self.img)
            return self.viewer.isopen

    def close(self):
        if hasattr(self, 'em'):
            del self.em
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
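
# A short random-agent rollout as a usage sketch. `retro.make` is the
# standard gym-retro entry point; the 'Airstriker-Genesis' ROM ships with
# gym-retro, but treat the exact id as an assumption.
def _retro_demo():
    env = retro.make(game='Airstriker-Genesis')
    obs = env.reset()
    done = False
    while not done:
        # MultiBinary action space: one bit per controller button.
        obs, rew, done, info = env.step(env.action_space.sample())
    env.close()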
from abc import abstractmethod

import gym
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import animation, cm
from skimage.draw import ellipse


class Water(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 3
    }

    FIELD = [
        'M',  # 0 agent
        'S',  # 1 start
        'G',  # 2 goal
        'W',  # 3 water
        'N',  # 4 nothing
    ]

    # Upper bound on the number of steps per episode.
    MAX_STEPS = 5000

    def __init__(self):
        super().__init__()
        self.viewer = None
        self.radius = 5
        self.rotation = 10
        self.ellipse_r = 10
        self.ellipse_c = 12
        self.x_shape = 10 * self.radius
        self.y_shape = 10 * self.radius
        self.MAP_shape = (self.x_shape, self.y_shape)

        # Set the action and observation spaces.
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Box(low=0,
                                                high=len(self.FIELD),
                                                shape=self.MAP_shape)
        self.reward_range = (-1., 1.)
        self.reset()

    def reset(self):
        self.map = self.ellipse_map
        self.pos = self.find_pos('S')[0]
        # self.goal = self.find_pos('G')[0]
        self.done = False
        self.reward = 0
        self.steps = 0
        self.visited = []
        return self.observe()

    # Depict the map.
    def ellipse_map(self):
        self.x = np.ones((self.x_shape, self.y_shape), dtype=np.uint8)
        self.x[self.x == 1] = 4
        # Start
        self.x[(0, 0)] = 1
        self.x_a, self.y_a = ellipse(self.x_shape / 2,
                                     self.y_shape / 2,
                                     self.ellipse_r,
                                     self.ellipse_c,
                                     rotation=np.deg2rad(self.rotation))
        self.x[(self.x_a, self.y_a)] = 3
        return self.x

    def is_movable(self, pos):
        return ((0 <= pos[0] < self.x_shape)
                and (0 <= pos[1] < self.y_shape))

    # Judge whether the agent has reached the goal.
    def is_goal(self, show=False):
        nrows, ncols = self.MAP_shape
        if self.pos[0] == nrows - 1 and self.pos[1] == ncols - 1:
            if show:
                print("Goal")
            return True
        else:
            return False

    def is_done(self, show=False):
        return (not self.is_movable(self.pos)) or self.is_goal(show) \
            or self.steps > self.MAX_STEPS

    def observe(self):
        # Copy the map and mark the agent's position on it.
        observation = np.copy(self.map())
        observation[tuple(self.pos)] = self.FIELD.index('M')
        return observation

    def point_finder(self):
        flat_space = np.reshape(self.observe(), [-1, 1])
        # print(flat_space)
        point = np.where(flat_space == 0)
        return int(point[0])

    def trace(self):
        self.row, self.col = np.where(self.observe() == 0)
        self.visited.append((int(self.row), int(self.col)))
        return self.visited

    def get_reward(self, pos, moved):
        if moved:
            if self.map()[tuple(pos)] == self.FIELD.index('W'):
                self.reward -= 10
            elif self.map()[tuple(pos)] == self.FIELD.index('N'):
                self.reward -= 0.3
        else:
            self.reward -= 0.5
        # Goal
        if self.is_goal():
            self.reward += 15
        return self.reward

    def find_pos(self, field_type):
        return np.array(
            [np.where(self.map() == self.FIELD.index(field_type))])

    def step(self, action):
        self.steps += 1
        if action == 0:
            next_pos = [x + y for (x, y) in zip(self.pos, [0, 1])]   # right
        elif action == 1:
            next_pos = [x + y for (x, y) in zip(self.pos, [-1, 0])]  # up
        elif action == 2:
            next_pos = [x + y for (x, y) in zip(self.pos, [1, 0])]   # down
        elif action == 3:
            next_pos = [x + y for (x, y) in zip(self.pos, [0, -1])]  # left

        if self.is_movable(next_pos):
            self.pos = next_pos
            moved = True
        else:
            moved = False

        reward = self.get_reward(self.pos, moved)
        observation = self.observe()
        trace = self.trace()
        state = self.point_finder()
        done = self.is_done(True)
        return trace, state, reward, observation, done

    def show(self):
        # plt.grid('on')
        ims = []
        nrows, ncols = self.MAP_shape
        fig = plt.figure()
        ax = plt.gca()
        ax.set_xticks(np.arange(0.5, nrows, 1))
        ax.set_yticks(np.arange(0.5, ncols, 1))
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        canvas = np.copy(self.map())
        for row, col in self.visited:
            canvas[(row, col)] = self.FIELD.index('M')
            img1 = plt.imshow(canvas, interpolation="bilinear", cmap=cm.GnBu)
            ims.append([img1])
        img = plt.imshow(canvas,
                         interpolation="bilinear",
                         cmap=cm.GnBu,
                         animated=True)
        ani = animation.ArtistAnimation(fig,
                                        ims,
                                        interval=100,
                                        blit=True,
                                        repeat_delay=1000)
        plt.show()
        return

    @abstractmethod
    def get_image(self):
        pass

    def render(self, mode='human', max_width=500):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        # img = Image.fromarray(img).resize(
        #     [int(ratio * img_width), int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
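
# Rollout sketch: note that `step` returns a non-standard 5-tuple
# (trace, state, reward, observation, done), so this env cannot be driven
# by off-the-shelf gym wrappers without adaptation.
def _water_demo(n_steps=20):
    env = Water()
    env.reset()
    for _ in range(n_steps):
        trace, state, reward, observation, done = env.step(
            env.action_space.sample())
        if done:
            break
    return trace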
import abc
import numbers
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
from gym import logger, spaces
from gym.utils import seeding

# `_repeat_axes` is a helper defined in the surrounding module; subclasses
# are expected to define the `state_layer_chars` and `objects` attributes.


class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 obs_type,
                 default_reward,
                 action_space,
                 act_null_value=4,
                 delay=30,
                 resize_scale=8,
                 crop_window=[5, 5]):
        """Create a `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            obs_type: type of observation to return.
            default_reward: default reward if reward is None returned by the
                `pycolab` game.
            action_space: the action `Space` of the environment.
            act_null_value: the value of the null (no-op) action.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
            crop_window: dimensions of observation cropping.
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)

        self._max_iterations = max_iterations
        self._default_reward = default_reward

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None

        self._colors = self.make_colors()
        test_game = self.make_game()
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Create the observation space.
        self.obs_type = obs_type
        if self.obs_type == 'mask':
            # Don't count the empty-space layer.
            self.observation_space = spaces.Box(
                0., 1., [len(self.state_layer_chars)] + crop_window)
        elif self.obs_type == 'rgb':
            self.observation_space = spaces.Box(
                0., 255., [crop_window[0] * resize_scale,
                           crop_window[1] * resize_scale] + [3])
        self.action_space = action_space
        self.act_null_value = act_null_value

        self.current_game = None
        self._croppers = []
        self._state = None

        self._last_uncropped_observations = None
        self._empty_uncropped_board = None
        self._last_cropped_observations = None
        self._empty_cropped_board = None

        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.resize_scale = resize_scale
        self.delay = delay

        # Metrics
        self.visitation_frequency = {char: 0 for char in self.objects}
        self.first_visit_time = {char: 500 for char in self.objects}

        # Heatmaps
        self.episodes = 0  # number of episodes run (to determine when to save heatmaps)
        self.heatmap_save_freq = 3  # save heatmaps every 3 episodes
        self.heatmap = np.ones((5, 5))  # stores counts each episode (5x5 is a placeholder)

    def pycolab_init(self, logdir, log_heatmaps):
        self.log_heatmaps = log_heatmaps
        root_path = os.path.abspath(__file__).split('/')[1:]
        root_path = root_path[:root_path.index('curiosity_baselines') + 1]
        self.heatmap_path = '/' + '/'.join(root_path) + '/' + '/'.join(
            logdir.split('/')[1:]) + '/heatmaps'
        if not os.path.isdir(self.heatmap_path) and log_heatmaps:
            os.makedirs(self.heatmap_path)

    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Function that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """
        return {
            'P': (255., 255., 255.),
            'a': (175., 255., 15.),
            'b': (21., 0., 255.),
            'c': (250., 0., 129.),
            'd': (0., 250., 71.),
            'e': (255., 0., 0.),
            'f': (252., 28., 3.),
            'g': (136., 3., 252.),
            'h': (20., 145., 60.),
            '#': (61., 61., 61.),
            '@': (255., 255., 0.),
            ' ': (0., 0., 0.)
        }

    def _paint_board(self, layers, cropped=False):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.
            cropped: whether this is being called to paint cropped or
                uncropped images.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
            layers.
        """
        if not cropped:
            board_shape = self._last_uncropped_observations.board.shape
        else:
            board_shape = self._last_cropped_observations.board.shape
        board = np.zeros(list(board_shape) + [3], np.uint32)
        board_mask = np.zeros(list(board_shape) + [3], bool)

        for key in self._render_order:
            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # Update the board with the new layer.
            board = np.where(np.logical_not(board_mask),
                             board_layer_mask * color, board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        # Disentangled one-hot state.
        if self.obs_type == 'mask':
            self._state = []
            for char in self.state_layer_chars:
                if char != ' ':
                    mask = observations.layers[char].astype(float)
                    if char in self.objects and 1. in mask:
                        self.visitation_frequency[char] += 1
                    self._state.append(mask)
            self._state = np.array(self._state)
        elif self.obs_type == 'rgb':
            rgb_img = self._paint_board(observations.layers,
                                        cropped=True).astype(float)
            self._state = self.resize(rgb_img)
            for char in self.state_layer_chars:
                if char != ' ':
                    mask = observations.layers[char].astype(float)
                    if char in self.objects and 1. in mask:
                        self.visitation_frequency[char] += 1

        # Update heatmap metric.
        if self.log_heatmaps == True:
            pr, pc = self.current_game.things['P'].position
            self.heatmap[pr, pc] += 1

        self._last_reward = reward if reward is not None else \
            self._default_reward

        self._game_over = self.current_game.game_over
        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def reset(self):
        """Start a new episode."""
        self.current_game = self.make_game()
        for cropper in self._croppers:
            cropper.set_engine(self.current_game)
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None
        observations, reward, _ = self.current_game.its_showtime()
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations.board)

        # Save and reset metrics.
        self.visitation_frequency = {char: 0 for char in self.objects}
        if self.log_heatmaps == True and self.episodes % self.heatmap_save_freq == 0:
            np.save('{}/{}.npy'.format(self.heatmap_path, self.episodes),
                    self.heatmap)
            heatmap_normed = self.heatmap / np.linalg.norm(self.heatmap)
            plt.imsave('{}/{}.png'.format(self.heatmap_path, self.episodes),
                       heatmap_normed,
                       cmap='afmhot',
                       vmin=0.0,
                       vmax=1.0)
        self.episodes += 1
        self.heatmap = np.zeros(self._last_uncropped_observations.board.shape)

        # Run update.
        self._update_for_game_step(observations, reward)
        return self._state

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended; call `reset` instead.")
            self._state = None
            reward = self._last_reward
            done = self._game_over
            return self._state, reward, done, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)

        # Crop and update.
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations.board)

        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Add custom metrics.
        info['visitation_frequency'] = self.visitation_frequency
        info['first_time_visit'] = self.first_visit_time

        # Check the current status of the game.
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None
        return self._state, reward, done, info

    def render(self, mode='rgb_array', close=False):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        img = self._empty_uncropped_board
        if self._last_uncropped_observations:
            img = self._last_uncropped_observations.board
            layers = self._last_uncropped_observations.layers
            if self._colors:
                img = self._paint_board(layers, cropped=False)
            else:
                assert img is not None, '`board` must not be `None`.'
        img = self.resize(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def resize(self, img):
        img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        return img.astype(np.uint8)

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
import os

import gym
import numpy as np
from gym import spaces
from gym.envs.classic_control.rendering import SimpleImageViewer

# UP, DOWN, LEFT, RIGHT are module-level action constants defined alongside
# this class.


class GridWorld(gym.Env):

    metadata = {'render.modes': ['human']}

    def __init__(self,
                 file_name="map1.txt",
                 fail_rate=0.0,
                 terminal_reward=1.0,
                 move_reward=0.0,
                 bump_reward=-0.5,
                 bomb_reward=-1.0):
        self.viewer = SimpleImageViewer()
        self.n = None
        self.m = None
        self.bombs = []
        self.walls = []
        self.goals = []
        self.start = None

        this_file_path = os.path.dirname(os.path.realpath(__file__))
        file_name = os.path.join(this_file_path, file_name)
        with open(file_name, "r") as f:
            for i, row in enumerate(f):
                row = row.rstrip('\r\n')
                if self.n is not None and len(row) != self.n:
                    raise ValueError(
                        "Map's rows are not of the same dimension...")
                self.n = len(row)
                for j, col in enumerate(row):
                    if col == "x" and self.start is None:
                        self.start = self.n * i + j
                    elif col == "x" and self.start is not None:
                        raise ValueError(
                            "There is more than one starting position in the map..."
                        )
                    elif col == "G":
                        self.goals.append(self.n * i + j)
                    elif col == "B":
                        self.bombs.append(self.n * i + j)
                    elif col == "1":
                        self.walls.append(self.n * i + j)
            self.m = i + 1

        if len(self.goals) == 0:
            raise ValueError("At least one goal needs to be specified...")

        self.n_states = self.n * self.m
        self.n_actions = 4
        self.fail_rate = fail_rate
        self.state = self.start
        self.terminal_reward = terminal_reward
        self.move_reward = move_reward
        self.bump_reward = bump_reward
        self.bomb_reward = bomb_reward

        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(self.n_states)
        self.done = False

    def step(self, action):
        assert self.action_space.contains(action)
        if self.state in self.goals or np.random.rand() < self.fail_rate:
            return self.state, 0.0, self.done, {}
        else:
            new_state = self.take_action(action)
            reward = self.get_reward(new_state)
            self.state = new_state
            return self.state, reward, self.done, {}

    def reset(self):
        self.done = False
        self.state = self.start
        return self.state

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        grid = np.multiply(np.ones((self.n_states, 3), dtype=np.uint8),
                           np.array([0, 255, 0], dtype=np.uint8))
        for g in self.goals:
            grid[g] = np.array([255, 0, 0])
        for b in self.bombs:
            grid[b] = np.array([255, 255, 0])
        for w in self.walls:
            grid[w] = np.array([0, 0, 0])
        grid[self.state] = np.array([0, 0, 255])
        grid = grid.reshape(self.m, self.n, 3)
        if mode == 'human':
            self.viewer.imshow(grid)
            return self.viewer.isopen
        elif mode == "rgb_array":
            return grid
        else:
            return

    def take_action(self, action):
        row = self.state // self.n
        col = self.state % self.n
        if action == DOWN and (row + 1) * self.n + col not in self.walls:
            row = min(row + 1, self.m - 1)
        elif action == UP and (row - 1) * self.n + col not in self.walls:
            row = max(0, row - 1)
        elif action == RIGHT and row * self.n + col + 1 not in self.walls:
            col = min(col + 1, self.n - 1)
        elif action == LEFT and row * self.n + col - 1 not in self.walls:
            col = max(0, col - 1)
        return row * self.n + col

    def get_reward(self, new_state):
        if new_state in self.goals:
            self.done = True
            return self.terminal_reward
        elif new_state in self.bombs:
            return self.bomb_reward
        elif new_state == self.state:
            return self.bump_reward
        return self.move_reward
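
# Example of the expected map file format (hedged: "map1.txt" ships with the
# original project; this layout merely follows the parser above, where 'x'
# marks the start, 'G' a goal, 'B' a bomb, and '1' a wall):
#
#   x001G
#   00010
#   000B0
#
# env = GridWorld(file_name="map1.txt")
# state = env.reset()
# state, reward, done, info = env.step(env.action_space.sample())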
import abc
import numbers
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
from gym import logger, spaces
from gym.utils import seeding
from scipy.stats import entropy

# `_repeat_axes` is a helper defined in the surrounding module; subclasses
# are expected to define the `state_layer_chars` and `objects` attributes.


class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 obs_type,
                 default_reward,
                 action_space,
                 act_null_value=4,
                 delay=30,
                 resize_scale=8,
                 crop_window=[5, 5],
                 visitable_states=0,
                 color_palette=0,
                 reward_switch=[],
                 reward_config=dict(),
                 switch_perturbations=[],
                 dimensions=(19, 19)):
        """Create a `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            obs_type: type of observation to return.
            default_reward: default reward if reward is None returned by the
                `pycolab` game.
            action_space: the action `Space` of the environment.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
            crop_window: dimensions of observation cropping.
            visitable_states: number of states the agent can visit.
            color_palette: which color palette to use for objects.
            reward_switch: list of objects or coords if the reward function
                switches.
            reward_config: mapping of objects to their associated rewards.
            switch_perturbations: color perturbations if a background switch
                is applied.
            dimensions: dimensions of the game board.
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)
        self._max_iterations = max_iterations

        # Reward specs
        self._default_reward = default_reward
        self._switch = 0
        self._reward_switch = reward_switch
        self._reward_target = None
        self._switch_perturbations = switch_perturbations
        self._reward_config = reward_config

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None

        self._color_palette = color_palette
        self._colors = self.make_colors()
        test_game = self.make_game(reward_config=self._reward_config)
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Prepare the observation space.
        self.obs_type = obs_type
        self.height, self.width = dimensions
        self.crop_window = crop_window
        self.action_space = action_space
        if self.obs_type == 'mask':
            # Don't count the empty-space layer.
            self.observation_space = spaces.Box(
                0., 1., [len(self.state_layer_chars)] + self.crop_window)
        elif self.obs_type == 'rgb':
            self.observation_space = spaces.Box(
                0., 255.,
                [self.crop_window[0] * 17, self.crop_window[1] * 17] + [3])
        elif self.obs_type == 'rgb_full':
            if 84 % self.width == 0:
                self.observation_space = spaces.Box(0., 255., [84, 84] + [3])
            else:
                self.observation_space = spaces.Box(0., 255., [85, 85] + [3])
        self.act_null_value = act_null_value
        self.visitable_states = visitable_states

        self.current_game = None
        self._croppers = []
        self._state = None

        self._last_uncropped_observations = None
        self._empty_uncropped_board = None
        self._last_cropped_observations = None
        self._empty_cropped_board = None

        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.delay = delay

        # Metrics
        self.visitation_frequency = {char: 0 for char in self.objects}
        self.first_visit_time = {char: 500 for char in self.objects}
        self.visitation_entropy = 0
        self.num_obj_eps = {char: 0 for char in self.objects}
        self.coverage = 0

    def heatmap_init(self, logdir, log_heatmaps):
        self.episodes = 0  # number of episodes run (to determine when to save heatmaps)
        self.heatmap_save_freq = 3  # save heatmaps every 3 episodes
        self.heatmap = np.zeros((5, 5))  # stores counts each episode (5x5 is a placeholder)
        self.log_heatmaps = log_heatmaps
        root_path = os.path.abspath(__file__).split('/')[1:]
        root_path = root_path[:root_path.index('curiosity_baselines') + 1]
        self.heatmap_path = '/' + '/'.join(root_path) + '/' + '/'.join(
            logdir.split('/')[1:]) + '/heatmaps'
        self.startup = True
        if not os.path.isdir(self.heatmap_path) and log_heatmaps:
            os.makedirs(self.heatmap_path)
        elif os.path.isdir(self.heatmap_path):
            heatmaps = os.listdir(self.heatmap_path)
            if len(heatmaps) != 0:
                sorted_images = sorted(heatmaps,
                                       key=lambda img: int(img.split('.')[0]))
                last_episode = int(sorted_images[-1].split('.')[0])
                self.episodes = last_episode

    def obs_init(self, resize_scale):
        self.resize_scale = resize_scale

    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Function that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """
        if self._color_palette == 0:
            return {
                'P': (255., 255., 255.),
                'a': (175., 255., 15.),
                'b': (21., 0., 255.),
                'c': (255., 0., 0.),
                'd': (19., 139., 67.),
                'e': (250., 0., 129.),
                'f': (114., 206., 227.),
                'g': (136., 3., 252.),
                'h': (245., 119., 34.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }
        elif self._color_palette == 1:
            return {
                'P': (255., 255., 255.),
                'a': (136., 3., 252.),
                'b': (21., 0., 255.),
                'c': (255., 0., 0.),
                'd': (19., 139., 67.),
                'e': (150., 0., 129.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }
        elif self._color_palette == 2:
            return {
                'P': (255., 255., 255.),
                'a': (255., 0., 0.),
                'b': (255., 0., 0.),
                'c': (255., 0., 0.),
                'd': (255., 0., 0.),
                'e': (255., 0., 0.),
                'f': (255., 0., 0.),
                'g': (255., 0., 0.),
                'h': (255., 0., 0.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }
        elif self._color_palette == 3:
            return {
                'P': (255., 255., 255.),
                'a': (30., 60., 90.),
                'b': (90., 60., 30.),
                'c': (90., 30., 60.),
                'd': (10., 100., 70.),
                'e': (10., 10., 160.),
                'f': (25., 130., 25.),
                'g': (50., 40., 90.),
                'h': (130., 25., 25.),
                '#': (61., 61., 61.),
                '@': (90., 90., 90.),
                ' ': (0., 0., 0.),
                '.': (110., 35., 35.)
            }

    def _check_visit(self, char):
        """Private method to check if the player has visited `char`.

        A visit is when the character is within the 5x5 tile window around
        the player.
        """
        pr, pc = self.current_game.things['P'].position
        cr, cc = self.current_game.things[char].position
        if (pr - 2) <= cr <= (pr + 2) and (pc - 2) <= cc <= (pc + 2):
            return True
        return False

    def _paint_board(self, layers, cropped=False):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.
            cropped: whether this is being called to paint cropped or
                uncropped images.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
            layers.
        """
        if not cropped:
            board_shape = self._last_uncropped_observations.board.shape
        else:
            board_shape = self._last_cropped_observations.board.shape
        board = np.zeros(list(board_shape) + [3], np.uint32)
        board_mask = np.zeros(list(board_shape) + [3], bool)

        for key in self._render_order:
            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # '@' corresponds to white noise or a changing background
            # perturbation.
            perturbation = np.zeros(board_layer_mask.shape)
            if key == '@':
                if len(self._reward_switch) > 0:
                    perturbation = self._switch_perturbations[self._switch]
                else:
                    h, w = board_layer_mask.shape[:2]
                    perturbation = np.random.randint(-15, 15, (h, w, 1))

            # Update the board with the new layer.
            board = np.where(np.logical_not(board_mask),
                             board_layer_mask * color + perturbation, board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        # Disentangled one-hot state.
        if self.obs_type == 'mask':
            self._state = []
            for char in self.state_layer_chars:
                if char in self.objects:
                    mask = observations.layers[char].astype(float)
                    if char in self.objects and 1. in mask:
                        self.visitation_frequency[char] += 1
                    self._state.append(mask)
            self._state = np.array(self._state)
        elif 'rgb' in self.obs_type:
            if self.obs_type == 'rgb':
                rgb_img = self._paint_board(observations.layers,
                                            cropped=True).astype(float)
            elif self.obs_type == 'rgb_full':
                rgb_img = self._paint_board(observations.layers,
                                            cropped=False).astype(float)
            self._state = self.resize(rgb_img)
            for char in self.state_layer_chars:
                if char in self.objects:
                    mask = observations.layers[char].astype(float)
                    if self._check_visit(char):
                        self.visitation_frequency[char] += 1

        # Update heatmap metrics.
        if self.log_heatmaps == True:
            pr, pc = self.current_game.things['P'].position
            self.heatmap[pr, pc] += 1
            self.visitation_entropy = entropy(self.heatmap.flatten(),
                                              base=self.visitable_states)
            self.coverage = np.count_nonzero(
                self.heatmap) / self.visitable_states

        # Update reward.
        self._last_reward = reward if reward is not None else \
            self._default_reward

        self._game_over = self.current_game.game_over
        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended; call `reset` instead.")
            self._state = None
            reward = self._last_reward
            done = self._game_over
            return self._state, reward, done, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)

        # Crop and update.
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations.board)

        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Add custom metrics.
        info['visitation_frequency'] = self.visitation_frequency
        info['first_time_visit'] = self.first_visit_time
        info['visitation_entropy'] = self.visitation_entropy
        info['coverage'] = self.coverage
        info['episodes'] = self.episodes
        info['num_obj_eps'] = self.num_obj_eps
        for ob in self.objects:
            pushes = getattr(self.current_game.things[ob], 'pushes', None)
            if pushes is not None:
                info['controllable_interactions'] = pushes

        # Check the current status of the game.
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None
        return self._state, reward, done, info

    def reset(self):
        """Start a new episode."""
        if len(self._reward_switch) > 0:
            self._switch = np.random.randint(len(self._reward_switch))
            self._reward_target = self._reward_switch[self._switch]
            self._reward_config = {char: 0.0 for char in self._reward_switch}
            self._reward_config[self._reward_switch[self._switch]] = 1.0
        self.current_game = self.make_game(reward_config=self._reward_config)
        for cropper in self._croppers:
            cropper.set_engine(self.current_game)
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None
        observations, reward, _ = self.current_game.its_showtime()
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)
        if len(self._croppers) > 0:
            observations = [
                cropper.crop(observations) for cropper in self._croppers
            ][0]
            self._last_cropped_observations = observations
            self._empty_cropped_board = np.zeros_like(
                self._last_cropped_observations.board)

        # Save and reset metrics.
        for char in self.objects:
            if self.visitation_frequency[char] > 0:
                self.num_obj_eps[char] += 1
        self.visitation_frequency = {char: 0 for char in self.objects}
        if self.log_heatmaps == True and \
                self.episodes % self.heatmap_save_freq == 0 and \
                self.startup == False:
            np.save('{}/{}.npy'.format(self.heatmap_path, self.episodes),
                    self.heatmap)
            heatmap_normed = self.heatmap / (np.linalg.norm(self.heatmap) +
                                             1e-19)
            plt.imsave('{}/{}.png'.format(self.heatmap_path, self.episodes),
                       heatmap_normed,
                       cmap='afmhot',
                       vmin=0.0,
                       vmax=1.0)
        self.episodes += 1
        self.startup = False
        self.heatmap = np.zeros(self._last_uncropped_observations.board.shape)

        # Run update.
        self._update_for_game_step(observations, reward)
        return self._state

    def render(self, mode='rgb_array', close=False):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        img = self._empty_uncropped_board
        if self._last_uncropped_observations:
            img = self._last_uncropped_observations.board
            layers = self._last_uncropped_observations.layers
            if self._colors:
                img = self._paint_board(layers, cropped=False)
            else:
                assert img is not None, '`board` must not be `None`.'
        img = self.resize(img, scale=17)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def resize(self, img, scale=None):
        if scale is None:
            img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        else:
            img = _repeat_axes(img, scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        return img.astype(np.uint8)

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
import copy
import random
import time

import cv2
import numpy as np

# `TileTypes`, `Agent`, and `GameGrid` are defined in the surrounding module.


class KrazyGridWorld:

    def __init__(self,
                 screen_height,
                 grid_squares_per_row=10,
                 one_hot_obs=True,
                 seed=42,
                 task_seed=None,
                 init_pos_seed=None,
                 death_square_percentage=0.1,
                 ice_sq_perc=0.05,
                 num_goals=3,
                 min_goal_distance=2,
                 max_goal_distance=np.inf,
                 num_steps_before_energy_needed=11,
                 energy_replenish=8,
                 energy_sq_perc=0.05,
                 num_transporters=1,
                 sparse_rewards=True,
                 image_obs=True,
                 use_local_obs=False):

        if task_seed is None:
            task_seed = seed
        if init_pos_seed is None:
            init_pos_seed = seed

        self.init_pos_rng = np.random.RandomState(init_pos_seed)
        self.task_rng = np.random.RandomState(task_seed)
        random.seed(task_seed)

        self.one_hot_obs = one_hot_obs
        self.image_obs = image_obs
        self.use_local_obs = use_local_obs
        self.screen_dim = (screen_height, screen_height)  # width and height

        self.tile_types = TileTypes()
        self.agent = Agent(
            num_steps_until_energy_needed=num_steps_before_energy_needed,
            energy_replenish=energy_replenish)
        self.game_grid = GameGrid(grid_squares_per_row=grid_squares_per_row,
                                  tile_types=self.tile_types,
                                  agent=self.agent,
                                  task_rng=self.task_rng,
                                  death_sq_perc=death_square_percentage,
                                  energy_sq_perc=energy_sq_perc,
                                  ice_sq_perc=ice_sq_perc,
                                  num_goals=num_goals,
                                  min_goal_distance=min_goal_distance,
                                  max_goal_distance=max_goal_distance,
                                  num_transporters=num_transporters)

        self.num_goals_obtained = 0
        self.sparse_reward = sparse_rewards

        self.reset_task()
        self.simple_image_viewer = None
        self.last_im_obs = None

    def reset(self,
              reset_agent_start_pos=False,
              reset_board=False,
              reset_colors=False,
              reset_dynamics=False):
        self.agent.dead = False
        self.agent.agent_position = copy.deepcopy(
            self.agent.agent_position_init)
        self.agent.num_steps_until_energy_needed = copy.deepcopy(
            self.agent.energy_init)
        self.num_goals_obtained = 0
        self.game_grid.grid_np = copy.deepcopy(self.game_grid.game_grid_init)
        if reset_colors:
            self.tile_types.reset_colors()
        if reset_dynamics:
            self.agent.change_dynamics()
        if reset_board:
            self.reset_task()
        if reset_agent_start_pos:
            self.reset_agent_start_position()
        return self.get_obs()

    def reset_task(self):
        # Reset the entire board and agent start position, generating a new MDP.
        self.game_grid.get_new_game_grid()
        self.reset_agent_start_position()

    def reset_agent_start_position(self):
        # Keep the previous board but update the agent's starting position.
        # Keeps the previous MDP but samples x_0.
        new_start = self.game_grid.get_one_non_agent_square()
        self.agent.agent_position = new_start
        self.agent.agent_position_init = new_start

    def get_obs(self):
        if self.image_obs:
            return self.get_img_obs()
        else:
            return None

    def step(self, a, render=False):
        if self.agent.dead is False:
            proposed_step = self.agent.try_step(a)
            if self.game_grid.is_position_legal(proposed_step):
                self.agent.agent_position = proposed_step
            self.check_dead()
            self.check_at_goal()
            self.check_at_energy()
            self.check_at_transporter()
            # Handle the ice squares: keep sliding in the same direction
            # while the agent stands on ice.
            while True:
                if self.check_at_ice_square() is False:
                    break
                else:
                    # Don't take energy for going over ice.
                    self.agent.num_steps_until_energy_needed += 1
                    proposed_step_nu = self.agent.try_step(a)
                    if self.game_grid.is_position_legal(proposed_step_nu):
                        self.step(a)
                    else:
                        break
            if self.agent.num_steps_until_energy_needed < 1:
                self.agent.dead = True
        if render:
            self.render()
        return self.get_obs(), self.get_reward(), self.agent.dead, dict()

    def check_dead(self):
        agent_pos = self.agent.agent_position
        game_grid = self.game_grid.grid_np
        if game_grid[agent_pos[0], agent_pos[1]] == self.tile_types.death:
            self.agent.dead = True

    def check_at_goal(self):
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.goal:
            self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] = self.tile_types.normal
            self.num_goals_obtained += 1

    def check_at_energy(self):
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.energy:
            self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] = self.tile_types.normal
            self.agent.give_energy()

    def check_at_transporter(self):
        transport_sq = None
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.transporter:
            for tr in self.game_grid.transporters:
                if self.agent.agent_position[0] == tr[0][0] and \
                        self.agent.agent_position[1] == tr[0][1]:
                    transport_sq = tr[1]
                elif self.agent.agent_position[0] == tr[1][0] and \
                        self.agent.agent_position[1] == tr[1][1]:
                    transport_sq = tr[0]
        if transport_sq is not None:
            self.agent.agent_position = [transport_sq[0], transport_sq[1]]

    def check_at_ice_square(self):
        if self.game_grid.grid_np[
                self.agent.agent_position[0],
                self.agent.agent_position[1]] == self.tile_types.ice:
            return True
        return False

    def render(self):
        if self.simple_image_viewer is None:
            from gym.envs.classic_control.rendering import SimpleImageViewer
            self.simple_image_viewer = SimpleImageViewer()
        im_obs = self.get_img_obs()
        self.simple_image_viewer.imshow(im_obs)
        time.sleep(0.075)

    def get_state_obs(self):
        grid_np = copy.deepcopy(self.game_grid.grid_np)
        agent_p = self.agent.agent_position
        grid_np[agent_p[0], agent_p[1]] = self.tile_types.agent
        grid_np = grid_np.astype(np.uint8)
        # agent_p = np.array(self.agent.agent_position)
        if self.one_hot_obs:
            n_values = np.max(grid_np) + 1
            grid_np = np.eye(n_values)[grid_np]
            # agent_p_temp = np.zeros((self.game_grid.grid_squares_per_row,
            #                          self.game_grid.grid_squares_per_row, 1))
            # agent_p_temp[agent_p[0], agent_p[1], :] = 1
        if self.use_local_obs:
            neighbors = []
            x, y = self.agent.agent_position
            for _i, _j in [(-1, -1), (0, -1), (1, -1), (1, 0), (1, 1), (0, 1),
                           (-1, 1), (-1, 0)]:
                i, j = (_i + x, _j + y)
                if 0 <= i < self.game_grid.grid_squares_per_row and \
                        0 <= j < self.game_grid.grid_squares_per_row:
                    neighbors.append([j, i])
                else:
                    neighbors.append(None)
            grid_np = np.array(neighbors)
        return grid_np.flatten()

    def get_img_obs(self):
        grid_np = copy.deepcopy(self.game_grid.grid_np)
        grid_np[self.agent.agent_position[0],
                self.agent.agent_position[1]] = self.tile_types.agent
        fake_img = np.zeros((self.game_grid.grid_squares_per_row,
                             self.game_grid.grid_squares_per_row, 3))
        for i in range(len(self.tile_types.all_tt)):
            is_grid_sq_color_i = grid_np == self.tile_types.all_tt[i]
            one_idxs = is_grid_sq_color_i.astype(int)
            one_idxs = np.tile(np.expand_dims(one_idxs, -1), 3)
            one_idxs = one_idxs * np.array(self.tile_types.colors[i].value)
            fake_img += one_idxs
        if self.use_local_obs:
            neighbors = []
            x, y = self.agent.agent_position
            valid_idxs = np.zeros_like(fake_img)
            valid_idxs[x, y] = 1.0
            for _i, _j in [(-1, -1), (0, -1), (1, -1), (1, 0), (1, 1), (0, 1),
                           (-1, 1), (-1, 0)]:
                i, j = (_i + x, _j + y)
                if 0 <= i < self.game_grid.grid_squares_per_row and \
                        0 <= j < self.game_grid.grid_squares_per_row:
                    # neighbors.append([j, i])
                    valid_idxs[i, j] = 1.0
                else:
                    neighbors.append(None)
            fake_img *= valid_idxs
        res = cv2.resize(fake_img,
                         dsize=(256, 256),
                         interpolation=cv2.INTER_NEAREST)
        res = res.astype(np.uint8)
        return res

    def get_reward(self):
        if self.sparse_reward:
            return 0 + self.num_goals_obtained
        else:
            rew = 0
            for goal in self.game_grid.goal_squares:
                dist_1 = abs(goal[0] - self.agent.agent_position[0])
                dist_2 = abs(goal[1] - self.agent.agent_position[1])
                rew = rew + dist_1 + dist_2
            rew = -1.0 * rew
            rew = rew + 3.0 * self.num_goals_obtained
            return rew

    def close(self):
        if self.simple_image_viewer is not None:
            self.simple_image_viewer.close()
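
# Usage sketch: KrazyGridWorld is standalone (it does not subclass gym.Env),
# so it is driven directly. Treating 0-3 as the action encoding is an
# assumption about what `Agent.try_step` expects.
def _krazy_demo():
    kgw = KrazyGridWorld(screen_height=256, grid_squares_per_row=10)
    obs = kgw.reset(reset_board=True)
    for _ in range(50):
        obs, rew, dead, info = kgw.step(np.random.randint(4))
        if dead:
            break
    kgw.close()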
import gym
import numpy as np
from gym.spaces import Box, Discrete
from gym.utils import seeding
from PIL import Image as PILImage

# `generate_grid` is provided by the surrounding module.


class BattleshipEnv(gym.Env):

    reward_range = (-float('inf'), float('inf'))
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 3
    }

    # hold_out > 0: run the env while excluding boards in held_out/<rules>.npy.
    # hold_out < 0: run the env cycling through boards in held_out/<rules>.npy.
    # hold_out = 0: run the env on the whole distribution without holding out
    #               boards (not used in the paper).
    def __init__(self, rules='chain', n_board=7, hold_out=0):
        self.viewer = None
        self.seed()

        action_converter = []
        for i in range(n_board):
            for j in range(n_board):
                action_converter.append((i, j))
        self.action_converter = np.asarray(action_converter)

        self.n_board = n_board
        self.hold_out = hold_out
        self.rules = rules

        if hold_out == -1:
            self.heldout = np.load('held_out/' + rules + '.npy')
            self.maze_idx = 0
            self.maze = np.reshape(self.heldout[self.maze_idx], (7, 7))
            if self.rules in ['all', 'chain', 'tree', 'loop']:
                start = np.load('held_out/' + self.rules +
                                '_starts.npy')[self.maze_idx]
            else:
                hit_idx = np.where(self.maze == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))),
                                          size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])
        else:
            if hold_out > 0:
                heldout = np.load('held_out/' + rules + '.npy')
                self.heldout = set([tuple(x) for x in heldout])
            gen = generate_grid(self.rules, n=self.n_board)
            if len(gen) == 2:
                grid, start = gen
            else:
                grid = gen
                hit_idx = np.where(grid == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))),
                                          size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])
            if hold_out > 0:
                while tuple(grid.flatten()) in self.heldout:
                    gen = generate_grid(self.rules, n=self.n_board)
                    if len(gen) == 2:
                        grid, start = gen
                    else:
                        grid = gen
                        hit_idx = np.where(grid == 1)
                        choice = np.random.choice(
                            list(range(len(hit_idx[0]))), size=1)
                        start = (hit_idx[0][choice], hit_idx[1][choice])
            self.maze = grid

        self.board = np.ones(self.maze.shape) * -1
        self.current_position = start
        self.board[self.current_position[0], self.current_position[1]] = 1
        self.num_hits = 0
        self.self_hits = {}

        self.observation_space = Box(low=-1,
                                     high=1,
                                     shape=(n_board * n_board +
                                            n_board * n_board + 1, ),
                                     dtype=np.float64)
        self.action_space = Discrete(np.prod(self.maze.shape))
        self.nA = n_board * n_board
        self.prev_reward = 0
        self.prev_action = np.zeros((self.nA, ))
        self.valid_actions = [1 for _ in range(self.nA)]

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        prev_position = self.current_position
        self.current_position = self.action_converter[action]
        reward = 0
        if self.board[self.current_position[0],
                      self.current_position[1]] == -1:
            if self.maze[self.current_position[0],
                         self.current_position[1]] == 1:
                self.board[self.current_position[0],
                           self.current_position[1]] = 1
                self.num_hits += 1
                reward = 1
            else:
                self.board[self.current_position[0],
                           self.current_position[1]] = 0
                reward = -1
        else:
            reward = -2
            if (self.current_position[0],
                    self.current_position[1]) not in self.self_hits.keys():
                self.self_hits[(self.current_position[0],
                                self.current_position[1])] = 1
            else:
                self.self_hits[(self.current_position[0],
                                self.current_position[1])] += 1
        if self._is_goal():
            reward = 10
            done = True
            if self.hold_out == -1:
                self.maze_idx += 1
        else:
            done = False
        p_action = self.prev_action
        p_reward = self.prev_reward
        self.prev_action = np.zeros((self.nA, ))
        self.prev_action[action] = 1
        self.prev_reward = reward
        obs = self.board.flatten()
        obs_array = np.concatenate((obs, p_action, [p_reward]))
        return obs_array, reward, done, {}

    def _is_goal(self):
        return np.sum(self.board == 1) == np.sum(self.maze == 1)

    def get_image(self):
        img = np.empty((*self.board.shape, 3), dtype=np.uint8)
        for i in range(self.board.shape[0]):
            for j in range(self.board.shape[1]):
                if self.board[i, j] == -1:
                    img[i, j, :] = 255, 255, 255
                elif self.board[i, j] == 1:
                    img[i, j, :] = 255, 0, 0
                    if (i, j) in self.self_hits.keys():
                        if (255 - 10 * self.self_hits[(i, j)]) < 5:
                            img[i, j, :] = 0, 0, 0
                        else:
                            img[i, j, :] = (255 -
                                            10 * self.self_hits[(i, j)]), 0, 0
                else:
                    img[i, j, :] = 0, 0, 255
                    if (i, j) in self.self_hits.keys():
                        if (255 - 10 * self.self_hits[(i, j)]) < 5:
                            img[i, j, :] = 0, 0, 0
                        else:
                            img[i, j, :] = 0, 0, (255 - 10 *
                                                  self.self_hits[(i, j)])
        return img

    def set_task(self, task):
        self.maze = task
        self.board = np.zeros(self.maze.shape)
        self.current_position = [
            np.random.choice(range(self.maze.shape[0])),
            np.random.choice(self.maze.shape[1])
        ]
        self.num_hits = 0
        self.self_hits = {}
        return self.board.flatten()

    def reset(self):
        if self.hold_out == -1:
            self.maze = np.reshape(
                self.heldout[self.maze_idx % len(self.heldout)], (7, 7))
            if self.rules in ['all', 'chain', 'tree', 'loop']:
                start = np.load('held_out/' + self.rules + '_starts.npy')[
                    self.maze_idx % len(self.heldout)]
            else:
                hit_idx = np.where(self.maze == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))),
                                          size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])
        else:
            gen = generate_grid(self.rules, n=self.n_board)
            if len(gen) == 2:
                grid, start = gen
            else:
                grid = gen
                hit_idx = np.where(grid == 1)
                choice = np.random.choice(list(range(len(hit_idx[0]))),
                                          size=1)
                start = (hit_idx[0][choice], hit_idx[1][choice])
            if self.hold_out > 0:
                while tuple(grid.flatten()) in self.heldout:
                    gen = generate_grid(self.rules, n=self.n_board)
                    if len(gen) == 2:
                        grid, start = gen
                    else:
                        grid = gen
                        hit_idx = np.where(grid == 1)
                        choice = np.random.choice(
                            list(range(len(hit_idx[0]))), size=1)
                        start = (hit_idx[0][choice], hit_idx[1][choice])
            self.maze = grid
        self.board = np.ones(self.maze.shape) * -1
        self.current_position = start
        self.board[self.current_position[0], self.current_position[1]] = 1
        self.num_hits = 0
        self.self_hits = {}
        obs = self.board.flatten()
        obs_array = np.concatenate((obs, self.prev_action,
                                    [self.prev_reward]))
        self.valid_actions = [1 for _ in range(self.nA)]
        return obs_array

    def render(self, mode='human', max_width=500):
        img = self.get_image()
        img = np.asarray(img).astype(np.uint8)
        img_height, img_width = img.shape[:2]
        ratio = max_width / img_width
        img = PILImage.fromarray(img).resize(
            [int(ratio * img_width), int(ratio * img_height)])
        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
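
# Rollout sketch: the observation concatenates the flattened board, the
# one-hot previous action, and the previous reward, matching the
# observation_space above. Requires the project's `generate_grid` helper
# and, for hold_out != 0, its held_out/*.npy files.
def _battleship_demo():
    env = BattleshipEnv(rules='chain', n_board=7, hold_out=0)
    obs = env.reset()
    assert obs.shape == (7 * 7 + 7 * 7 + 1, )
    obs, reward, done, _ = env.step(env.action_space.sample())
    return reward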
class TetrisEnv(gym.Env, gym.utils.EzPickle):
    """An environment for playing Tetris in OpenAI Gym."""

    # meta-data about the environment for OpenAI Gym utilities (like Monitor)
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30,
    }

    def __init__(self, max_steps: int, random_state: int = None) -> None:
        """
        Initialize a new Tetris environment.

        Args:
            max_steps: the max number of steps per episode.
            random_state: the random seed to start the environment with

        Returns:
            None

        """
        gym.utils.EzPickle.__init__(self)
        self.max_steps = max_steps
        self.viewer = None
        self.step_number = 0
        # Setup the observation space as RGB game frames
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(SCREEN_HEIGHT, SCREEN_WIDTH, 3),
            dtype=np.uint8
        )
        # Setup the action space, the game defines 12 legal actions
        self.action_space = gym.spaces.Discrete(12)
        # setup the game
        self.game = Tetris()
        self.seed(random_state)

    @property
    def screen(self) -> np.ndarray:
        """Return the screen of the game."""
        return self.game.screen

    def reset(self) -> np.ndarray:
        """Reset the emulator and return the initial state."""
        self.game.reset()
        # reset the step count
        self.step_number = 0
        # return the initial screen from the game
        return self.game.screen

    def step(self, action: int) -> tuple:
        """
        Take a step using the given action.

        Args:
            action: the discrete action to perform. will use the action in
                    `self.actions` indexed by this value

        Returns:
            a tuple of:
            - the state as a result of the action
            - the reward achieved by taking the action
            - a flag denoting whether the episode has ended
            - a dictionary of extra information

        """
        state, reward, done, info = self.game.step(action)
        self.step_number += 1
        # if this step has passed the max number, set the episode to done
        if self.step_number >= self.max_steps:
            done = True

        return state, reward, done, info

    def render(self, mode: str = 'human'):
        """
        Render the current screen using the given mode.

        Args:
            mode: the mode to render the screen using
                - 'human': render in a window using GTK
                - 'rgb_array': render in the back-end and return a matrix

        Returns:
            None if mode is 'human' or a matrix if mode is 'rgb_array'

        """
        # if the mode is RGB, return the screen as a NumPy array
        if mode == 'rgb_array':
            return self.game.screen
        # if the mode is human, create a viewer and display the screen
        elif mode == 'human':
            from pyglet.window import Window
            from gym.envs.classic_control.rendering import SimpleImageViewer
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
                self.viewer.window = Window(
                    width=SCREEN_WIDTH,
                    height=SCREEN_HEIGHT,
                    caption=self.spec.id,
                )
            self.viewer.imshow(self.game.screen)
            return self.viewer.isopen
        # otherwise the render mode is not supported, raise an error
        else:
            raise ValueError('unsupported render mode: {}'.format(repr(mode)))

    def close(self) -> None:
        """Close the emulator."""
        # delete the existing game if there is one
        if isinstance(self.game, Tetris):
            del self.game
        if self.viewer is not None:
            self.viewer.close()
            del self.viewer

    def seed(self, random_state: int = None) -> list:
        """
        Set the seed for this env's random number generator(s).

        Args:
            random_state: the seed to set the random generator to

        Returns:
            A list of seeds used in this env's random number generators

        """
        random.seed(random_state)
        self.curr_seed = random_state
        return [self.curr_seed]

    def get_keys_to_action(self) -> dict:
        """Return the dictionary of keyboard keys to actions."""
        # Map of in-game directives to their associated keyboard value
        down = ord('s')
        left = ord('a')
        right = ord('d')
        rot_l = ord('q')
        rot_r = ord('e')
        # A mapping of pressed key combinations to discrete actions
        keys_to_action = {
            (): 0,
            (left,): 1,
            (right,): 2,
            (down,): 3,
            (rot_l,): 4,
            (rot_r,): 5,
            tuple(sorted((left, down))): 6,
            tuple(sorted((right, down))): 7,
            tuple(sorted((left, rot_l))): 8,
            tuple(sorted((right, rot_l))): 9,
            tuple(sorted((left, rot_r))): 10,
            tuple(sorted((right, rot_r))): 11,
        }

        return keys_to_action
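
# Because TetrisEnv exposes get_keys_to_action, it can be driven from the
# keyboard with Gym's interactive play utility. A minimal sketch, assuming
# the Tetris backend is importable and a display is available:
def _play_tetris_example():
    from gym.utils.play import play
    env = TetrisEnv(max_steps=5000)
    # a/d move, s soft-drops, q/e rotate, per the mapping above.
    play(env, fps=30, keys_to_action=env.get_keys_to_action())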
class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 default_reward,
                 action_space,
                 delay=30,
                 resize_scale=8):
        """Create a `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            default_reward: default reward if reward is None returned by
                the `pycolab` game.
            action_space: the action `Space` of the environment.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)

        self._max_iterations = max_iterations
        self._default_reward = default_reward

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None

        self._colors = self.make_colors()
        test_game = self.make_game()
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Create the observation space.
        observation_layers = list(set(layers))
        self._observation_order = sorted(observation_layers)
        channels = [3]
        channel_max = 255.
        channel_min = 0.
        self._game_shape = list(observations.board.shape) + channels
        self.observation_space = spaces.Box(
            low=np.full(self._game_shape, channel_min, np.float32),
            high=np.full(self._game_shape, channel_max, np.float32),
            dtype=np.float32)
        self.action_space = action_space

        self.current_game = None
        self._last_observations = None
        self._empty_board = None
        self._last_state = None
        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.resize_scale = resize_scale
        self.delay = delay

    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Function that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """
        return {}

    def _paint_board(self, layers):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
            layers.
        """
        board_shape = self._last_observations.board.shape
        board = np.zeros(list(board_shape) + [3], np.uint32)
        # `np.bool` is a deprecated alias; use the builtin instead.
        board_mask = np.zeros(list(board_shape) + [3], dtype=bool)

        for key in self._render_order:
            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # Update the board with the new layer.
            board = np.where(
                np.logical_not(board_mask),
                board_layer_mask * color,
                board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        self._last_observations = observations
        self._empty_board = np.zeros_like(self._last_observations.board)
        self._last_state = self._paint_board(observations.layers).astype(
            np.float32)
        self._last_reward = reward if reward is not None else \
            self._default_reward
        self._game_over = self.current_game.game_over

        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def reset(self):
        """Start a new episode."""
        self.current_game = self.make_game()
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None
        observations, reward, _ = self.current_game.its_showtime()
        self._update_for_game_step(observations, reward)
        return self._last_state

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended; call `reset` instead.")
            state = self._last_state
            reward = self._last_reward
            done = self._game_over
            return state, reward, done, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Check the current status of the game.
        state = self._last_state
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None
        return state, reward, done, info

    def render(self, mode='human'):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        img = self._empty_board
        if self._last_observations:
            img = self._last_observations.board
            layers = self._last_observations.layers
            if self._colors:
                img = self._paint_board(layers)
        else:
            assert img is not None, '`board` must not be `None`.'

        img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        img = img.astype(np.uint8)

        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
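
# A concrete subclass only needs to provide make_game (and optionally
# make_colors). A minimal sketch built on pycolab's ascii_art helpers; the
# art, the 'P' sprite, and the two-action corridor are illustrative
# assumptions, not part of this module:
from pycolab import ascii_art
from pycolab.prefab_parts import sprites as prefab_sprites

CORRIDOR_ART = ['#####',
                '#P  #',
                '#####']

class CorridorSprite(prefab_sprites.MazeWalker):
    """Walks left/right; walls ('#') are impassable."""

    def __init__(self, corner, position, character):
        super(CorridorSprite, self).__init__(
            corner, position, character, impassable='#')

    def update(self, actions, board, layers, backdrop, things, the_plot):
        if actions == 0:
            self._west(board, the_plot)
        elif actions == 1:
            self._east(board, the_plot)

class CorridorEnv(PyColabEnv):
    def __init__(self):
        super(CorridorEnv, self).__init__(
            max_iterations=50,
            default_reward=0.,
            action_space=spaces.Discrete(2))

    def make_colors(self):
        return {'P': (255, 255, 255), '#': (61, 61, 61)}

    def make_game(self):
        return ascii_art.ascii_art_to_game(
            CORRIDOR_ART, what_lies_beneath=' ',
            sprites={'P': CorridorSprite})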
class TrafficEnv(gym.Env):
    def __init__(self, nlanes, ncars, images=True, sh=50):
        self.ncars = ncars
        self.nlanes = nlanes
        self.images = images
        self.l_lims = np.array((0.0, 0.0))  # f, b
        self.h_lims = np.array((1.0, 1.0))  # f, b
        self.vva = 0.005
        self.vlims = (0.01, 0.02)
        self.rsx = SWD / SHD
        self.sh = sh
        self.sw = int(self.sh * self.rsx)
        self.action_space = spaces.Tuple(
            (spaces.Discrete(3), spaces.Box(self.l_lims, self.h_lims)))
        if self.images:
            self.observation_space = spaces.Box(0.0, 1.0,
                                                shape=(self.sw, self.sh))
        else:
            obs_low = np.array((0.0, 0.0, -self.vva) +
                               (-self.rsx, -2.0, 0.0, -0.2) * (self.ncars - 1))
            obs_high = np.array((self.rsx, 0.2, self.vva) +
                                (self.rsx, 2.0, self.rsx, 0.2) * (self.ncars - 1))
            self.observation_space = spaces.Box(obs_low, obs_high)
        self.lanes = (np.arange(self.nlanes) + 0.5) * \
            (self.sw / self.sh) / self.nlanes
        self.hlanes = (np.arange(self.nlanes + 1) + 0.0) * (
            (self.sw - 1) / self.sh) / self.nlanes
        self.viewer = None
        self.cars = []
        self.is_final = False
        self.lanev = None
        self.reward_func = None
        self.np_random = None
        self.seed()

    def seed(self, seed=None):
        if seed is not None:
            np.random.seed(seed)
        return [seed]

    def reset(self):
        self.is_final = False
        car0y = 0.5
        self.lanev = np.random.rand(
            self.nlanes) * (self.vlims[1] - self.vlims[0]) + self.vlims[0]
        self.cars = [
            Car(self.lanes, int(self.nlanes / 2), car0y,
                self.lanev[int(self.nlanes / 2)])
        ]
        for ii in range(1, self.ncars):
            self.cars.append(self.get_car())
        return self.road_img(car0y - 0.5, self.sw,
                             self.sh)[0] if self.images else self.get_state()

    def get_car(self, ymin=0, ymax=1, lane=None):
        c_lane = np.random.randint(self.nlanes) if lane is None else lane
        c = Car(self.lanes, c_lane,
                np.random.rand() * (ymax - ymin) + ymin, self.lanev[c_lane])
        # Resample until the new car does not overlap an existing one.
        while self.car_overlaps(c):
            c_lane = np.random.randint(self.nlanes) if lane is None else lane
            c = Car(self.lanes, c_lane,
                    np.random.rand() * (ymax - ymin) + ymin, self.lanev[c_lane])
        return c

    def car_overlaps(self, c, margin=1.2):
        for ii in range(len(self.cars)):
            if (self.cars[ii].lane == c.lane) and \
                    (np.abs(self.cars[ii].py - c.py) <
                     (margin * (self.cars[ii].sy + c.sy))):
                return True
        return False

    def step(self, action):
        # Tuple spaces expose their components via `.spaces`.
        assert self.action_space.spaces[0].contains(
            action[0]), 'Action {} is invalid.'.format(action[0])
        assert self.action_space.spaces[1].contains(
            action[1]), 'Action {} is invalid.'.format(action[1])
        if not self.is_final:
            if action[0] != 1:
                self.cars[0].va = (action[0] - 1) * self.vva
            # Integrate the dynamics for 200 sub-steps per env step.
            for it in range(200):
                car0y = self.cars[0].py
                for c in self.cars[1:]:
                    c.step(0.0, 1.0)
                self.cars[0].step(action[1][0] * 0.0005,
                                  (1.0 - action[1][1]) * 0.0001 + 0.99989)
            # Recycle cars that drifted more than one unit away.
            car0y = self.cars[0].py
            for ii in range(1, len(self.cars)):
                if (self.cars[ii].py - car0y) > 1:
                    self.cars[ii] = self.get_car(car0y - 0.5, car0y - 1.0)
                elif (self.cars[ii].py - car0y) < -1:
                    self.cars[ii] = self.get_car(car0y + 0.5, car0y + 1.0)
            rimg_i, cimgs_i = self.road_img(car0y - 0.5, self.sw, self.sh)
            self.is_final = (self.num_collisions(cimgs_i) > 0.0) or \
                ((self.cars[0].px - self.cars[0].sx) <= 0.0) or \
                ((self.cars[0].px + self.cars[0].sx) >= self.cars[0].rsx)
            reward = 0.0 if self.reward_func is None else self.reward_func(
                self.cars[0].px / self.rsx, self.cars[0].v,
                self.cars[0].va != 0.0, self.is_final)
        else:
            rimg_i, cimgs_i = self.road_img(self.cars[0].py - 0.5,
                                            self.sw, self.sh)
            self.is_final = True
            reward = 0.0
        # The done flag is a bool, per the Gym interface.
        return (rimg_i if self.images else self.get_state()), reward, \
            bool(self.is_final), {}

    def render(self, mode='human'):
        if self.viewer is None:
            self.viewer = SimpleImageViewer()
        rimg_i = self.road_img(self.cars[0].py - 0.5, SWD, SHD)[0]
        img = np.transpose(
            np.stack([((1.0 - rimg_i) * 255).astype(np.uint8)] * 3, axis=2),
            (1, 0, 2))[::-1, :, :]
        self.viewer.imshow(img)

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None

    def car_imgs(self, y0, swi, shi):
        return np.transpose(
            np.array([c.render(y0, swi, shi) for c in self.cars]), (1, 2, 0))

    def road_img(self, y0, swi, shi):
        cimgs_i = self.car_imgs(y0, swi, shi)
        lane_line = 1.0 * (np.sin(
            ((np.arange(shi) / shi) + y0) * 20 * 2 * np.pi) > 0)
        road_img = np.sum(cimgs_i[:, :, 1:] * 0.25, axis=2) + \
            cimgs_i[:, :, 0] * 1.0
        for l in self.hlanes:
            road_img[int(l * shi), :] = np.maximum(road_img[int(l * shi), :],
                                                   lane_line * 0.75)
        return np.minimum(road_img, 1.0), cimgs_i

    def num_collisions(self, cimgs_i):
        # Pixels covered by more than one car indicate a collision.
        return np.sum((np.sum(cimgs_i, axis=2) > 1.0).flatten())

    def get_state(self):
        p0y = self.cars[0].py
        p0x = self.cars[0].px
        v0y = self.cars[0].v
        return np.concatenate(
            (np.array([p0x, v0y, self.cars[0].va]), ) + tuple(
                np.array([c.px - p0x, c.py - p0y, c.px, c.v - v0y])
                for c in self.cars[1:]))
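
# reward_func defaults to None (zero reward) so callers can plug in their
# own shaping. From the call site in step, it receives the normalized
# lateral position, the speed, a turning flag, and the terminal flag.
# A hypothetical example:
def _traffic_reward_example():
    env = TrafficEnv(nlanes=3, ncars=6)

    def shaped_reward(lateral, speed, is_turning, crashed):
        # Reward speed, mildly penalize steering, heavily penalize crashes.
        return speed - 0.01 * float(is_turning) - (10.0 if crashed else 0.0)

    env.reward_func = shaped_reward
    return env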
class PursuersEvaders(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self,
                 file_name="mmap1.txt",
                 catch_level=2,
                 terminal_reward=10.0,
                 ontarget_reward=1.0,
                 move_reward=0.0,
                 bump_reward=-0.2):
        self.viewer = SimpleImageViewer()
        self.n = None
        self.m = None
        self.catch_level = catch_level
        self.walls = []
        self.init_evaders = []
        self.init_pursuers = []
        this_file_path = os.path.dirname(os.path.realpath(__file__))
        file_name = os.path.join(this_file_path, file_name)
        with open(file_name, "r") as f:
            for i, row in enumerate(f):
                row = row.rstrip('\r\n')
                if self.n is not None and len(row) != self.n:
                    raise ValueError(
                        "Map's rows are not of the same dimension...")
                self.n = len(row)
                for j, col in enumerate(row):
                    if col == "P":
                        self.init_pursuers.append(self.n * i + j)
                    elif col == "E":
                        self.init_evaders.append(self.n * i + j)
                    elif col == "1":
                        self.walls.append(self.n * i + j)
            self.m = i + 1
        if self.m < 3 or self.n < 3:
            raise ValueError("Map too small...")
        if len(self.init_pursuers) < self.catch_level:
            raise ValueError(
                "At least `catch_level` pursuers need to be specified...")
        if len(self.init_evaders) == 0:
            raise ValueError("At least one evader needs to be specified...")
        self.evaders = copy.copy(self.init_evaders)
        self.pursuers = copy.copy(self.init_pursuers)
        self.n_states = self.n * self.m
        self.n_actions = 5**len(self.init_pursuers)
        self.terminal_reward = terminal_reward
        self.ontarget_reward = ontarget_reward
        self.move_reward = move_reward
        self.bump_reward = bump_reward
        self.action_space = spaces.Box(0, 4, (len(self.init_pursuers), ))
        self.observation_space = spaces.Box(-1, 3, (3, 3))
        self.done = False

    def step(self, action):
        assert self.action_space.contains(action)
        if len(self.evaders) == 0:
            # Gym expects the info entry to be a dict, not None.
            return self.build_observation(), 0.0, self.done, {}
        else:
            new_state = self.take_action(action)
            reward = self.get_reward(new_state, action)
            self.pursuers = new_state
            self.take_evaders_action()
            return self.build_observation(), reward, self.done, {}

    def reset(self):
        self.done = False
        self.evaders = copy.copy(self.init_evaders)
        self.pursuers = copy.copy(self.init_pursuers)
        return self.build_observation()

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        # Paint the map: free cells green, evaders red, walls black,
        # pursuers blue. Use uint8 so 255 does not overflow (int8 wraps
        # 255 to -1). Building the grid unconditionally also makes it
        # available to the 'rgb_array' branch below.
        grid = np.multiply(np.ones((self.n_states, 3), dtype=np.uint8),
                           np.array([0, 255, 0], dtype=np.uint8))
        for e in self.evaders:
            grid[e] = np.array([255, 0, 0])
        for w in self.walls:
            grid[w] = np.array([0, 0, 0])
        for p in self.pursuers:
            grid[p] = np.array([0, 0, 255])
        grid = grid.reshape(self.m, self.n, 3)
        if mode == 'human':
            self.viewer.imshow(grid)
            return self.viewer.isopen
        elif mode == "rgb_array":
            return grid
        else:
            return

    def take_action(self, action):
        new_state = []
        for a, p in zip(action, self.pursuers):
            # Integer division: `/` would yield floats under Python 3.
            row = p // self.n
            col = p % self.n
            if a == DOWN and (row + 1) * self.n + col not in self.walls:
                row = min(row + 1, self.m - 1)
            elif a == UP and (row - 1) * self.n + col not in self.walls:
                row = max(0, row - 1)
            elif a == RIGHT and row * self.n + col + 1 not in self.walls:
                col = min(col + 1, self.n - 1)
            elif a == LEFT and row * self.n + col - 1 not in self.walls:
                col = max(0, col - 1)
            new_state.append(row * self.n + col)
        return new_state

    def take_evaders_action(self):
        new_goals = []
        for e in self.evaders:
            row = e // self.n
            col = e % self.n
            # Each evader moves uniformly at random (randint, not randn).
            a = np.random.randint(0, 5)
            if a == DOWN and (row + 1) * self.n + col not in self.walls:
                row = min(row + 1, self.m - 1)
            elif a == UP and (row - 1) * self.n + col not in self.walls:
                row = max(0, row - 1)
            elif a == RIGHT and row * self.n + col + 1 not in self.walls:
                col = min(col + 1, self.n - 1)
            elif a == LEFT and row * self.n + col - 1 not in self.walls:
                col = max(0, col - 1)
            new_goals.append(row * self.n + col)
        self.evaders = new_goals

    def get_reward(self, new_state, action):
        reward = 0.0
        for i, p in enumerate(new_state):
            # Count how many pursuers share this cell.
            n = 1
            for x in new_state[i + 1:]:
                if x == p:
                    n += 1
            if n >= self.catch_level and p in self.evaders:
                reward += self.terminal_reward
                self.evaders.remove(p)
                if len(self.evaders) == 0:
                    self.done = True
            elif p in self.evaders:
                reward += self.ontarget_reward
            elif p == self.pursuers[i] and action[i] != NOOP:
                reward += self.bump_reward
            else:
                reward += self.move_reward
        return reward

    def build_observation(self):
        observations = []
        for p in self.pursuers:
            row = p // self.n
            col = p % self.n
            # 3x3 egocentric view: -1 walls/out-of-bounds, 1 evader,
            # 2 pursuer, 3 evader and pursuer on the same cell.
            o = np.zeros((3, 3), dtype=np.int8)
            for i in range(-1, 2):
                for j in range(-1, 2):
                    if row + i < 0 or row + i >= self.m or \
                            col + j < 0 or col + j >= self.n:
                        o[i + 1][j + 1] = -1
                    else:
                        q = (row + i) * self.n + col + j
                        if q in self.walls:
                            o[i + 1][j + 1] = -1
                        elif q in self.evaders:
                            if q in self.pursuers:
                                o[i + 1][j + 1] = 3
                            else:
                                o[i + 1][j + 1] = 1
                        elif q in self.pursuers:
                            o[i + 1][j + 1] = 2
            o = o.tolist()
            for i, e in enumerate(o):
                o[i] = tuple(e)
            observations.append(tuple(o))
        return tuple(observations)
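
# n_actions is 5 ** len(pursuers), i.e. one flat index per joint action,
# while step expects one sub-action per pursuer. A hypothetical helper to
# convert between the two encodings:
def decode_joint_action(index, n_pursuers):
    """Expand a flat joint-action index into base-5 per-pursuer actions."""
    actions = []
    for _ in range(n_pursuers):
        actions.append(index % 5)
        index //= 5
    return actions

# e.g. with 2 pursuers, 7 = 1 * 5 + 2: pursuer 0 acts 2, pursuer 1 acts 1.
assert decode_joint_action(7, 2) == [2, 1]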
class PyColabEnv(gym.Env):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
    }

    def __init__(self,
                 max_iterations,
                 default_reward,
                 action_space,
                 act_null_value=4,
                 delay=30,
                 resize_scale=8,
                 crop_window=[5, 5],
                 render_mode='uncropped'):
        """Create a `PyColabEnv` adapter to a `pycolab` game as a `gym.Env`.

        You can access the `pycolab.Engine` instance with `env.current_game`.

        Args:
            max_iterations: maximum number of steps.
            default_reward: default reward if reward is None returned by
                the `pycolab` game.
            action_space: the action `Space` of the environment.
            delay: renderer delay.
            resize_scale: number of pixels per observation pixel.
                Used only by the renderer.
            crop_window: dimensions of observation cropping.
            render_mode: render board `cropped` or `uncropped`.
        """
        assert max_iterations > 0
        assert isinstance(default_reward, numbers.Number)

        self._max_iterations = max_iterations
        self._default_reward = default_reward

        # At this point, the game would only want to access the random
        # property, although it is set to None initially.
        self.np_random = None

        self._colors = self.make_colors()
        test_game = self.make_game()
        test_game.the_plot.info = {}
        observations, _, _ = test_game.its_showtime()
        layers = list(observations.layers.keys())
        not_ordered = list(set(layers) - set(test_game.z_order))
        self._render_order = list(reversed(not_ordered + test_game.z_order))

        # Create the observation space: one binary layer per tracked
        # character over the crop window (the empty-space layer is not
        # counted).
        observation_layers = list(set(layers))
        self._observation_order = sorted(observation_layers)
        self.observation_space = spaces.Box(
            0., 1., [len(self.state_layer_chars)] + crop_window)

        self.action_space = action_space
        self.act_null_value = act_null_value

        self.current_game = None
        self._croppers = []
        self._state = None
        self._last_observations = None
        self._last_uncropped_observations = None
        self._empty_board = None
        self._empty_uncropped_board = None
        self._last_painted = None
        self._last_uncropped_painted = None
        self._last_reward = None
        self._game_over = False

        self.viewer = None
        self.resize_scale = resize_scale
        self.render_mode = render_mode
        self.delay = delay

        # Metrics
        self.visitation_frequency = {char: 0 for char in self.objects}
        self.first_visit_time = {char: 500 for char in self.objects}
        self.heat_map = None

    @abc.abstractmethod
    def make_game(self):
        """Function that creates a new pycolab game.

        Returns:
            pycolab.Engine.
        """
        pass

    def make_colors(self):
        """Function that returns colors.

        Returns:
            Dictionary mapping key name to `tuple(R, G, B)`.
        """
        return {'P': (255., 255., 255.),
                'a': (175., 255., 15.),
                'b': (21., 0., 255.),
                'c': (0., 250., 71.),
                'd': (250., 0., 129.),
                'e': (255., 0., 0.),
                '#': (61., 61., 61.),
                '@': (255., 255., 0.),
                ' ': (0., 0., 0.)}

    def _paint_board(self, layers):
        """Method to privately paint layers to RGB.

        Args:
            layers: a dictionary mapping a character to the respective curtain.

        Returns:
            3D np.array (np.uint32) representing the RGB of the observation
            layers.
        """
        if self.render_mode == 'uncropped':
            board_shape = self._last_uncropped_observations.board.shape
        elif self.render_mode == 'cropped':
            board_shape = self._last_observations.board.shape
        board = np.zeros(list(board_shape) + [3], np.uint32)
        # `np.bool` is a deprecated alias; use the builtin instead.
        board_mask = np.zeros(list(board_shape) + [3], dtype=bool)

        for key in self._render_order:
            color = self._colors.get(key, (0, 0, 0))
            color = np.reshape(color, [1, 1, -1]).astype(np.uint32)

            # Broadcast the layer to [H, W, C].
            board_layer_mask = np.array(layers[key])[..., None]
            board_layer_mask = np.repeat(board_layer_mask, 3, axis=-1)

            # Update the board with the new layer.
            board = np.where(
                np.logical_not(board_mask),
                board_layer_mask * color,
                board)

            # Update the mask.
            board_mask = np.logical_or(board_layer_mask, board_mask)
        return board

    def _update_for_game_step(self, observations, reward):
        """Update internal state with data from an environment interaction."""
        # Update the heat map with the player's current position.
        r, c = self.current_game._sprites_and_drapes['P'].position
        self.heat_map[r, c] += 1

        # Build the disentangled one-hot state: one binary mask per tracked
        # character, skipping the empty-space layer.
        self._state = []
        for char in self.state_layer_chars:
            if char != ' ':
                mask = observations.layers[char].astype(float)
                if char in self.objects and 1. in mask:
                    self.visitation_frequency[char] += 1
                self._state.append(mask)
        self._state = np.array(self._state)

        # Rendering purposes (RGB).
        self._last_observations = observations
        if self.render_mode == 'cropped':
            self._empty_board = np.zeros_like(self._last_observations.board)
            self._last_painted = self._paint_board(
                observations.layers).astype(np.float32)

        self._last_reward = reward if reward is not None else \
            self._default_reward
        self._game_over = self.current_game.game_over

        if self.current_game.the_plot.frame >= self._max_iterations:
            self._game_over = True

    def reset(self):
        """Start a new episode."""
        self.current_game = self.make_game()
        for cropper in self._croppers:
            cropper.set_engine(self.current_game)
        self._colors = self.make_colors()
        self.current_game.the_plot.info = {}
        self._game_over = None
        self._last_observations = None
        self._last_reward = None
        observations, reward, _ = self.current_game.its_showtime()
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)
        self._last_uncropped_painted = self._paint_board(
            observations.layers).astype(np.float32)
        if len(self._croppers) > 0:
            observations = [cropper.crop(observations)
                            for cropper in self._croppers][0]
        self.heat_map = np.zeros((self.current_game.rows,
                                  self.current_game.cols))
        self._update_for_game_step(observations, reward)
        # Reset the visitation trackers.
        self.visitation_frequency = {char: 0 for char in self.objects}
        return self._state

    def step(self, action):
        """Apply action, step the world forward, and return observations.

        Args:
            action: the desired action to apply to the environment.

        Returns:
            state, reward, done, info.
        """
        if self.current_game is None:
            logger.warn("Episode has already ended; call `reset` instead.")
            # Return the one-hot state so the observation type matches the
            # regular code path below.
            return self._state, self._last_reward, self._game_over, {}

        # Execute the action in pycolab.
        self.current_game.the_plot.info = {}
        observations, reward, _ = self.current_game.play(action)
        self._last_uncropped_observations = observations
        self._empty_uncropped_board = np.zeros_like(
            self._last_uncropped_observations.board)
        self._last_uncropped_painted = self._paint_board(
            observations.layers).astype(np.float32)

        # Crop and update.
        if len(self._croppers) > 0:
            observations = [cropper.crop(observations)
                            for cropper in self._croppers][0]
        self._update_for_game_step(observations, reward)
        info = self.current_game.the_plot.info

        # Add custom metrics.
        info['visitation_frequency'] = self.visitation_frequency
        info['first_time_visit'] = self.first_visit_time
        info['heat_map'] = self.heat_map

        # Check the current status of the game.
        reward = self._last_reward
        done = self._game_over

        if self._game_over:
            self.current_game = None
        return self._state, reward, done, info

    def render(self, mode='rgb_array', close=False):
        """Render the board to an image viewer or an np.array.

        Args:
            mode: One of the following modes:
                - 'human': render to an image viewer.
                - 'rgb_array': render to an RGB np.array (np.uint8)

        Returns:
            3D np.array (np.uint8) or a `viewer.isopen`.
        """
        if self.render_mode == 'cropped':
            img = self._empty_board
            if self._last_observations:
                img = self._last_observations.board
                layers = self._last_observations.layers
                if self._colors:
                    img = self._paint_board(layers)
            else:
                assert img is not None, '`board` must not be `None`.'
        elif self.render_mode == 'uncropped':
            img = self._empty_uncropped_board
            if self._last_uncropped_observations:
                img = self._last_uncropped_observations.board
                layers = self._last_uncropped_observations.layers
                if self._colors:
                    img = self._paint_board(layers)
            else:
                assert img is not None, '`board` must not be `None`.'

        img = _repeat_axes(img, self.resize_scale, axis=[0, 1])
        if len(img.shape) != 3:
            img = np.repeat(img[..., None], 3, axis=-1)
        img = img.astype(np.uint8)

        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                from gym.envs.classic_control.rendering import (
                    SimpleImageViewer)
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
            time.sleep(self.delay / 1e3)
            return self.viewer.isopen

    def seed(self, seed=None):
        """Seeds the environment.

        Args:
            seed: seed of the random engine.

        Returns:
            [seed].
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Tears down the renderer."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None
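
# _croppers is left empty here and populated by subclasses. A plausible
# setup using pycolab's cropping module, with a 5x5 egocentric window that
# matches the default crop_window (the tracked character 'P' and pad_char
# are assumptions):
def _make_croppers_example():
    from pycolab import cropping
    # Scroll a 5x5 observation window with the player sprite 'P'.
    return [cropping.ScrollingCropper(rows=5, cols=5, to_track=['P'],
                                      pad_char=' ')]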
class GridWorld(gym.Env):
    """Custom gridworld environment that follows the OpenAI Gym interface.

    The world is height x width tiles and contains n_buttons buttons.
    The agent has to press them in ascending order.
    """
    metadata = {'render.modes': ['human']}
    action2name = ['UP', 'DOWN', 'LEFT', 'RIGHT', 'PRESS']
    name2action = {k: v for v, k in enumerate(action2name)}
    action2delta = np.array([[-1, 0],
                             [1, 0],
                             [0, -1],
                             [0, 1],
                             [0, 0]], dtype=int)

    def __init__(self, height=5, width=5, n_buttons=3, button_pos=None,
                 pixels_per_tile=10, seed=None, obs_dtype='bool'):
        """
        :param height: height of the world (in tiles)
        :param width: width of the world (in tiles)
        :param n_buttons: number of buttons
        :param button_pos: (optional) list of (2,) numpy arrays - positions of the buttons
        :param pixels_per_tile: height/width of a tile for rendering
        :param seed: if specified, sets this seed to numpy random
        :param obs_dtype: 'bool' or 'int' observation format.
            'bool' for agent + one-hot encoding of buttons,
            'int' for one integer for each tile
        """
        # Validate obs_dtype before it is used to build the spaces.
        if obs_dtype not in ['bool', 'int']:
            raise ValueError('obs_dtype should be "bool" or "int"')
        self.action_space = spaces.Discrete(5)
        if obs_dtype == 'bool':
            self.observation_space = spaces.Box(
                low=0, high=1, shape=(2 * n_buttons + 1, height, width),
                dtype=int)
        else:
            # The 'int' format is one integer per tile; the Box needs an
            # explicit shape here.
            self.observation_space = spaces.Box(
                low=0, high=2 * (2 * n_buttons + 1), shape=(height, width),
                dtype=int)
        self.height = height
        self.width = width
        self.n_buttons = n_buttons
        self.button_pos = button_pos
        if seed is not None:
            np.random.seed(seed)
        if self.button_pos is None:
            # Sample distinct tiles for the buttons.
            self.button_pos = []
            idx = np.random.choice(height * width, n_buttons, replace=False)
            for index in idx:
                self.button_pos.append(
                    np.array([index // width, index % width], dtype=int))
            self.button_idx = tuple(idx)
        else:
            self.button_idx = tuple(a * width + b for (a, b) in button_pos)
        self.obs_dtype = obs_dtype
        self.next_button = None
        self.pos = None
        self.viewer = SimpleImageViewer()
        self.pixels_per_tile = pixels_per_tile

    def next_pos(self, pos, action):
        """
        Returns the next position of the agent

        :param pos: current position
        :param action: action number
        :return: next position
        """
        delta = self.action2delta[action]
        res = pos + delta
        res[0] = np.clip(res[0], 0, self.height - 1)
        res[1] = np.clip(res[1], 0, self.width - 1)
        return res

    def get_observation(self):
        """
        Returns an observation of the current environment state

        :return: if obs_dtype == 'bool': numpy array of shape
                     (2 * n_buttons + 1, height, width) with values of 0 and 1
                 if obs_dtype == 'int': numpy array of shape (height, width)
                     with values in range(2 * (2 * n_buttons + 1))
        """
        if self.obs_dtype == 'bool':
            obs = np.zeros((2 * self.n_buttons + 1, self.height, self.width),
                           dtype=int)
            h, w = self.pos
            # Agent position channel
            obs[0, h, w] = 1
            for ind, b_pos in enumerate(self.button_pos):
                h, w = b_pos
                if ind < self.next_button:
                    # Pressed
                    obs[2 * ind + 1, h, w] = 1
                else:
                    # Unpressed
                    obs[2 * ind + 2, h, w] = 1
            return obs
        if self.obs_dtype == 'int':
            obs = np.zeros((self.height, self.width), dtype=int)
            h, w = self.pos
            obs[h, w] = 2 * self.n_buttons + 1
            for ind, b_pos in enumerate(self.button_pos):
                h, w = b_pos
                if ind < self.next_button:
                    # Pressed
                    obs[h, w] += 2 * ind + 1
                else:
                    # Unpressed
                    obs[h, w] += 2 * ind + 2
            return obs

    def step(self, action):
        """
        Step function of the environment

        :param action: int from range(5)
        :return: observation - np.array (depends on obs_dtype)
                 reward - float, 1. if the last button is pressed, else 0.
                 done - bool, True if the last button is pressed
                 info - information dict
        """
        if self.action2name[action] == 'PRESS':
            if np.all(self.pos == self.button_pos[self.next_button]):
                self.next_button += 1
        self.pos = self.next_pos(self.pos, action)
        obs = self.get_observation()
        done = (self.next_button == self.n_buttons)
        reward = float(done)
        info = self.get_info()
        return obs, reward, done, info

    def reset(self):
        """
        Resets the environment to the initial state

        :return: observation of the initial state
        """
        self.next_button = 0
        self.pos = np.array([0, 0], dtype=int)
        return self.get_observation()

    def render(self, mode='human', close=False):
        """Used for rendering of the environment"""
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        if mode == 'human':
            grid = grid_from_state_data(self.height, self.width,
                                        self.n_buttons, self.button_pos,
                                        self.pixels_per_tile, self.pos,
                                        self.next_button)
            self.viewer.imshow(grid)
            return self.viewer.isopen
        elif mode == 'get_grid':
            return grid_from_state_data(self.height, self.width,
                                        self.n_buttons, self.button_pos,
                                        self.pixels_per_tile, self.pos,
                                        self.next_button)
        else:
            return

    def close(self):
        pass

    def get_expert_action(self, eps=0.05):
        """
        Returns an action from (1 - eps) * optimal policy + eps * random policy

        :param eps: probability of a random action
        :return: action number from range(5)
        """
        if eps and np.random.rand() < eps:
            return int(np.random.randint(low=0, high=5))
        target = self.button_pos[self.next_button]
        if np.all(self.pos == target):
            return self.name2action['PRESS']
        vert = target[0] - self.pos[0]
        hor = target[1] - self.pos[1]
        if np.random.rand() < abs(vert) / (abs(vert) + abs(hor)):
            # go vertical
            if vert > 0:
                action = self.name2action['DOWN']
            else:
                action = self.name2action['UP']
        else:
            # go horizontal
            if hor > 0:
                action = self.name2action['RIGHT']
            else:
                action = self.name2action['LEFT']
        return action

    def get_info(self):
        """
        Information dict about the environment

        :return: dict with the 'state_tuple' key: encoding of the
                 environment state (not used)
        """
        info = {'state_tuple': self.button_idx + (self.next_button,) +
                tuple(self.pos)}
        return info

    def to_random_state(self, seed=239):
        """
        Moves the environment to a random state (preserves button positions)

        :param seed: if specified, sets this seed to numpy random
        :return: observation of the new state
        """
        if seed is not None:
            np.random.seed(seed)
        self.pos[0] = np.random.randint(0, self.height)
        self.pos[1] = np.random.randint(0, self.width)
        self.next_button = np.random.randint(0, self.n_buttons)
        return self.get_observation()

    def get_all_next_states_with_data(self):
        """
        Returns all of the possible next states and their 'state data'

        :return: states - observations of all states, accessible from the
                     current one
                 data - 'state data' of those states
        """
        backup = self.pos.copy(), self.next_button
        states = []
        data = []
        for next_button in range(self.next_button, self.n_buttons):
            for pos_h in range(self.height):
                for pos_w in range(self.width):
                    self.pos[0] = pos_h
                    self.pos[1] = pos_w
                    self.next_button = next_button
                    states.append(self.get_observation())
                    data.append(self.get_state_data())
        # The terminal state: every button pressed, agent on the last one.
        self.next_button = self.n_buttons
        self.pos[:] = self.button_pos[-1]
        states.append(self.get_observation())
        data.append(self.get_state_data())
        self.pos, self.next_button = backup
        return states, data

    def get_state_data(self):
        """
        Returns the 'state data' tuple

        :return: height, width, number of buttons, button positions,
                 pixels per tile, agent position, next button number -
                 copied from the environment
        """
        return (copy(self.height), copy(self.width), copy(self.n_buttons),
                copy(self.button_pos), copy(self.pixels_per_tile),
                copy(self.pos), copy(self.next_button))

    def load_state_data(self, height, width, n_buttons, button_pos,
                        pixels_per_tile, pos, next_button):
        """Loads environment data from the 'state data' format into the
        environment"""
        self.height = height
        self.width = width
        self.n_buttons = n_buttons
        self.button_pos = button_pos
        self.pixels_per_tile = pixels_per_tile
        self.pos = pos
        self.next_button = next_button
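
# A quick end-to-end check: roll out the built-in epsilon-greedy expert
# until the last button is pressed. Hypothetical driver code:
def _gridworld_expert_rollout():
    env = GridWorld(height=5, width=5, n_buttons=3, seed=0)
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action = env.get_expert_action(eps=0.05)
        obs, reward, done, info = env.step(action)
        total_reward += reward
    # Reward is 1.0 exactly once, on the step that presses the last button.
    return total_reward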