import numpy as np
from ale_python_interface import ALEInterface  # newer installs: from ale_py import ALEInterface


class pyrlcade_environment(object):
    def __init__(self, rom_file, ale_frame_skip):
        self.ale = ALEInterface()
        self.max_frames_per_episode = self.ale.getInt("max_num_frames_per_episode")
        # The original snippet used a generic set(); typed setInt() calls are assumed here.
        self.ale.setInt("random_seed", 123)
        self.ale.setInt("disable_color_averaging", 1)
        self.ale.setInt("frame_skip", ale_frame_skip)
        self.ale.loadROM(rom_file)
        self.legal_actions = self.ale.getMinimalActionSet()
        ram_size = self.ale.getRAMSize()
        self.ram = np.zeros((ram_size), dtype=np.uint8)
        # getRAM(buffer) fills the buffer in place; keep a reference to it as the state
        self.ale.getRAM(self.ram)
        self.state = self.ram

    def reset_state(self):
        self.ale.reset_game()

    def set_action(self, a):
        self.action = a

    def step(self):
        self.reward = self.ale.act(self.action)
        is_terminal = self.ale.game_over()
        return is_terminal

    def get_state(self):
        self.ale.getRAM(self.ram)
        return self.ram

    def get_reward(self):
        return self.reward
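# Hedged usage sketch (not from the original source): drives pyrlcade_environment with
# random actions for one episode. The ROM path "breakout.bin" and a frame skip of 4 are
# placeholders for illustration only; module-level imports above are assumed.
if __name__ == "__main__":
    env = pyrlcade_environment("breakout.bin", ale_frame_skip=4)
    env.reset_state()
    terminal = False
    episode_reward = 0
    while not terminal:
        # pick a random legal action and advance one (frame-skipped) emulator step
        env.set_action(np.random.choice(env.legal_actions))
        terminal = env.step()
        episode_reward += env.get_reward()
        ram_state = env.get_state()  # 128-byte RAM vector
    print("episode reward:", episode_reward)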
class AtariEnvironment(interfaces.Environment):

    def __init__(self, atari_rom, frame_skip=4, noop_max=30,
                 terminate_on_end_life=False, random_seed=123,
                 frame_history_length=4, use_gui=False, max_num_frames=500000,
                 repeat_action_probability=0.0, record_screen_dir=None):
        self.ale = ALEInterface()
        self.ale.setInt('random_seed', random_seed)
        self.ale.setInt('frame_skip', 1)
        self.ale.setFloat('repeat_action_probability', 0.0)
        self.ale.setInt('max_num_frames_per_episode', max_num_frames)
        if record_screen_dir is not None:
            self.ale.setString('record_screen_dir', record_screen_dir)
        self.ale.loadROM(atari_rom)
        self.frame_skip = frame_skip
        self.repeat_action_probability = repeat_action_probability
        self.noop_max = noop_max
        self.terminate_on_end_life = terminate_on_end_life
        self.current_lives = self.ale.lives()
        self.is_terminal = False
        self.previous_action = 0
        self.num_actions = len(self.ale.getMinimalActionSet())
        w, h = self.ale.getScreenDims()
        self.screen_width = w
        self.screen_height = h
        self.zero_last_frames = [
            np.zeros((84, 84), dtype=np.uint8),
            np.zeros((84, 84), dtype=np.uint8)
        ]
        self.last_two_frames = copy.copy(self.zero_last_frames)
        self.zero_history_frames = [
            np.zeros((84, 84), dtype=np.uint8)
            for i in range(0, frame_history_length)
        ]
        self.frame_history = copy.copy(self.zero_history_frames)
        atari_actions = self.ale.getMinimalActionSet()
        self.atari_to_onehot = dict(
            list(zip(atari_actions, list(range(len(atari_actions))))))
        self.onehot_to_atari = dict(
            list(zip(list(range(len(atari_actions))), atari_actions)))
        self.screen_image = np.zeros(self.screen_height * self.screen_width,
                                     dtype=np.uint8)
        self.use_gui = use_gui
        self.original_frame = np.zeros((h, w), dtype=np.uint8)
        self.refresh_time = datetime.timedelta(milliseconds=1000 / 60)
        self.last_refresh = datetime.datetime.now()
        if (self.use_gui):
            self.gui_screen = pygame.display.set_mode((w, h))

    def getRAM(self, ram=None):
        return self.ale.getRAM(ram)

    def _get_frame(self):
        self.ale.getScreenGrayscale(self.screen_image)
        image = self.screen_image.reshape(
            [self.screen_height, self.screen_width, 1])
        self.original_frame = image
        image = cv2.resize(image, (84, 84))
        return image

    def perform_action(self, onehot_index_action):
        if self.repeat_action_probability > 0:
            if np.random.uniform() < self.repeat_action_probability:
                onehot_index_action = self.previous_action
            self.previous_action = onehot_index_action
        action = self.onehot_to_atari[onehot_index_action]
        state, action, reward, next_state, self.is_terminal = \
            self.perform_atari_action(action)
        return state, onehot_index_action, reward, next_state, self.is_terminal

    def perform_atari_action(self, atari_action):
        state = self.get_current_state()
        reward = self._act(atari_action, self.frame_skip)
        if self.use_gui:
            self.refresh_gui()
        self.frame_history[:-1] = self.frame_history[1:]
        self.frame_history[-1] = np.max(self.last_two_frames, axis=0)
        next_state = self.get_current_state()
        return state, atari_action, reward, next_state, self.is_terminal

    def _act(self, ale_action, repeat):
        reward = 0
        for i in range(repeat):
            reward += self.ale.act(ale_action)
            if i >= repeat - 2:
                self.last_two_frames = [self.last_two_frames[1], self._get_frame()]
        self.is_terminal = self.ale.game_over()
        # terminate the episode if current_lives has decreased
        lives = self.ale.lives()
        if self.current_lives != lives:
            if self.current_lives > lives and self.terminate_on_end_life:
                self.is_terminal = True
            self.current_lives = lives
        return reward

    def get_current_state(self):
        # return copy.copy(self.frame_history)
        return [x.copy() for x in self.frame_history]

    def get_actions_for_state(self, state):
        return [self.atari_to_onehot[a] for a in self.ale.getMinimalActionSet()]

    def reset_environment(self):
        self.last_two_frames = [self.zero_history_frames[0], self._get_frame()]
        if self.terminate_on_end_life:
            if self.ale.game_over():
                self.ale.reset_game()
        else:
            self.ale.reset_game()
        self.current_lives = self.ale.lives()
        if self.noop_max > 0:
            num_noops = np.random.randint(self.noop_max + 1)
            self._act(0, num_noops)
        self.previous_action = 0
        self.frame_history = copy.copy(self.zero_history_frames)
        self.frame_history[-1] = np.max(self.last_two_frames, axis=0)
        if self.use_gui:
            self.refresh_gui()

    def is_current_state_terminal(self):
        return self.is_terminal

    def refresh_gui(self):
        current_time = datetime.datetime.now()
        if (current_time - self.last_refresh) > self.refresh_time:
            self.last_refresh = current_time
            gui_image = np.tile(
                np.transpose(self.original_frame, axes=(1, 0, 2)), [1, 1, 3])
            # gui_image = np.zeros((self.screen_width, self.screen_height, 3), dtype=np.uint8)
            # channel = np.random.randint(3)
            # gui_image[:,:,channel] = np.transpose(self.original_frame, axes=(1, 0, 2))[:,:,0]
            pygame.surfarray.blit_array(self.gui_screen, gui_image)
            pygame.display.update()
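# Hedged usage sketch (not part of the original source): one random-action episode through
# AtariEnvironment. The ROM path "breakout.bin" is a placeholder, `interfaces` is assumed to
# be the project-local module providing the Environment base class, and the module-level
# imports the class relies on (numpy as np, copy, datetime, cv2, pygame) are assumed present.
if __name__ == "__main__":
    env = AtariEnvironment("breakout.bin", frame_skip=4, use_gui=False)
    env.reset_environment()
    total_reward = 0
    while not env.is_current_state_terminal():
        action = np.random.randint(env.num_actions)  # one-hot index; mapped to an ALE action internally
        state, action, reward, next_state, terminal = env.perform_action(action)
        total_reward += reward
    print("episode reward:", total_reward)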
# get atari screen pixels and blit them
numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.int32)
ale.getScreenRGB(numpy_surface)
logger.log(a, TYPE_ACTION, cur_time)
# if cur_time % 2 == 0:
logger.log(numpy_surface, TYPE_SCREEN, cur_time)
del numpy_surface
screen.blit(pygame.transform.scale2x(game_surface), (0, 0))

# get RAM
ram_size = ale.getRAMSize()
ram = np.zeros((ram_size), dtype=np.uint8)
ale.getRAM(ram)

# Display RAM bytes, 16 per row, as hex
font = pygame.font.SysFont("Ubuntu Mono", 32)
text = font.render("RAM: ", 1, (255, 208, 208))
screen.blit(text, (330, 10))
font = pygame.font.SysFont("Ubuntu Mono", 25)
height = font.get_height() * 1.2
line_pos = 40
ram_pos = 0
while ram_pos < 128:
    ram_string = ''.join(["%02X " % ram[x] for x in range(ram_pos, min(ram_pos + 16, 128))])
    text = font.render(ram_string, 1, (255, 255, 255))
    screen.blit(text, (340, line_pos))
    line_pos += height
    ram_pos += 16
class MsPacManGame(object):
    """Ms. Pac-Man Arcade Learning Environment wrapper class."""

    def __init__(self, seed, display):
        """Constructs a MsPacManGame.

        Args:
            seed: Initial random seed, randomized when None.
            display: Whether to display onto the screen or not.
        """
        self._ale = ALEInterface()
        if seed is None:
            seed = random.randint(0, 255)
        self._ale.setInt("random_seed", seed)
        if display:
            if sys.platform == "darwin":
                # Use PyGame in macOS.
                import pygame
                pygame.init()
                # Sound doesn't work on macOS.
                self._ale.setBool("sound", False)
            elif sys.platform.startswith("linux"):
                self._ale.setBool("sound", True)
            self._ale.setBool("display_screen", True)
        self._ale.loadROM("MS_PACMAN.BIN")
        self._reward = 0
        self._raw_ms_pacman_position = (0, 0)
        self.__screen = self._ale.getScreen()
        self.__ram = self._ale.getRAM()
        self._lives = self._ale.lives()
        self._update_state()
        self._go_to((94, 98), 3)

    @property
    def lives(self):
        """Current lives remaining."""
        return self._lives

    @property
    def reward(self):
        """Current total reward."""
        return self._reward

    @property
    def map(self):
        """Current game map."""
        return self._map

    @property
    def sliced_map(self):
        """Current game slice map."""
        return self._sliced_map

    @property
    def ms_pacman_position(self):
        """Ms. PacMan's position as a map index."""
        return self._ms_pacman_position

    @property
    def fruit(self):
        """Fruit."""
        return self._fruit

    @property
    def ghosts(self):
        """List of ghosts."""
        return self._ghosts

    def available_actions(self):
        """Returns a list of available actions to consider."""
        actions = []
        for action, move in [
            (2, (-1, 0)),  # up
            (3, (0, 1)),   # right
            (4, (0, -1)),  # left
            (5, (1, 0))    # down
        ]:
            new_pos = self.get_next_position(self._ms_pacman_position, move)
            if 0 <= new_pos[0] < GameMap.HEIGHT:
                if self._map.map[new_pos] != GameMapObjects.WALL:
                    actions.append(action)
        return actions

    def action_to_move(self, action):
        return [(-1, 0), (0, 1), (0, -1), (1, 0)][action - 2]

    def get_next_position(self, curr_position, move):
        new_pos = (curr_position[0] + move[0], curr_position[1] + move[1])
        if new_pos[1] < 0:
            new_pos = (new_pos[0], new_pos[1] + GameMap.WIDTH)
        elif new_pos[1] >= GameMap.WIDTH:
            new_pos = (new_pos[0], new_pos[1] - GameMap.WIDTH)
        return new_pos

    def act(self, action):
        """Plays a given action in the game.

        Args:
            action: Action to play.

        Returns:
            Partial reward gained since last action.
        """
        m = self.action_to_move(action)
        next_pos = self.get_next_position(self._ms_pacman_position, m)
        old_reward = self._reward
        old_lives = self._lives
        expected_reward = GameMapObjects.to_reward(self._map.map[next_pos])
        MAX_ACTION_COUNT = 20
        for _ in range(MAX_ACTION_COUNT):
            if expected_reward <= 0:
                if self._ms_pacman_position == next_pos:
                    break
            elif self._reward != old_reward:
                break
            if self.game_over() or self._lives < old_lives:
                return GameMapObjects.to_reward(GameMapObjects.BAD_GHOST)
            self._reward += self._ale.act(action)
            self._update_state()
            self._update_map()
        return self._reward - old_reward

    def _go_to(self, raw_pos, action):
        """Goes to a given position."""
        while (abs(self._raw_ms_pacman_position[0] - raw_pos[0]) > 1 or
               abs(self._raw_ms_pacman_position[1] - raw_pos[1]) > 1):
            self._ale.act(action)
            self._update_state()
            self._update_map()

    def game_over(self):
        """Returns whether the game reached a terminal state or not."""
        return self._ale.game_over()

    def reset_game(self):
        """Resets the game to the initial state."""
        self._reward = 0
        return self._ale.reset_game()

    def _to_map_position(self, pos):
        """Converts a RAM coordinate into a map coordinate.

        Args:
            pos: (x, y) coordinates from RAM.

        Returns:
            Map index coordinate.
        """
        x, y = pos
        i = round((y - 2) / 12.0)
        if x < 83:
            j = round((x - 18) / 8.0 + 1)
        elif 93 < x < 169:
            j = round((x - 22) / 8.0 + 1)
        elif x > 169:
            j = 0
        elif x < 88:
            j = 9
        else:
            j = 10
        return i, j

    def _to_raw_position(self, pos):
        i, j = pos
        y = i * 12 + 2
        if j == 0:
            x = 12
        elif j <= 9:
            x = (j - 1) * 8 + 18
        else:
            x = (j - 1) * 8 + 22
        return x, y

    def _update_state(self):
        """Updates the internal state of the game."""
        # Get new states from RAM.
        self._ale.getRAM(self.__ram)
        new_ms_pacman_position = (int(self.__ram[10]), int(self.__ram[16]))
        new_ghosts_ram = [
            ((int(self.__ram[6]), int(self.__ram[12])), int(self.__ram[1])),
            ((int(self.__ram[7]), int(self.__ram[13])), int(self.__ram[2])),
            ((int(self.__ram[8]), int(self.__ram[14])), int(self.__ram[3])),
            ((int(self.__ram[9]), int(self.__ram[15])), int(self.__ram[4]))
        ]
        fruit = (int(self.__ram[11]), int(self.__ram[17])), int(self.__ram[5])
        self._fruit = Fruit.from_ram(self._to_map_position(fruit[0]), fruit[1],
                                     fruit[0][0] != 0)
        # Update positions.
        self._raw_ms_pacman_position = new_ms_pacman_position
        self._ms_pacman_position = self._to_map_position(new_ms_pacman_position)
        self._ghosts = [
            Ghost.from_ram(self._to_map_position(pos), ram)
            for pos, ram in new_ghosts_ram
        ]
        # Update lives.
        self._lives = self._ale.lives()

    def _update_map(self):
        # Get new map from screen.
        self._ale.getScreen(self.__screen)
        self._map = GameMap(self.__screen.reshape(210, 160))
        self._blank_map = GameMap.from_map(self._map.map.copy())
        self._map.map[self._ms_pacman_position] = GameMapObjects.MS_PACMAN
        if self._fruit.exists:
            self._map.map[self._fruit.position] = GameMapObjects.FRUIT
        for ghost in self._ghosts:
            if ghost.state == Ghost.GOOD:
                self._map.map[ghost.position] = GameMapObjects.GOOD_GHOST
            elif ghost.state == Ghost.BAD:
                self._map.map[ghost.position] = GameMapObjects.BAD_GHOST
        self._sliced_map = SlicedGameMap(self._map, self._ms_pacman_position)
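# Hedged usage sketch (not from the original source): a random-policy episode driven through
# MsPacManGame. It assumes MS_PACMAN.BIN is in the working directory, as the constructor
# expects, and that the project-local GameMap / GameMapObjects / Fruit / Ghost / SlicedGameMap
# modules used above are importable.
if __name__ == "__main__":
    game = MsPacManGame(seed=None, display=False)
    while not game.game_over():
        actions = game.available_actions()  # subset of {2: up, 3: right, 4: left, 5: down}
        if not actions:
            actions = [2, 3, 4, 5]
        partial_reward = game.act(random.choice(actions))
    print("final reward:", game.reward, "lives:", game.lives)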
def main():
    # Group the final screens of repeated rollouts to measure how deterministic each
    # reset strategy (loadROM / restoreState / restoreSystemState / setRAM) really is.
    result = {
        'name': [],
        'grouped_num': [],
        'distribution': [],
    }
    result_str = ''
    # all_game_list = ['air_raid-n', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis']
    # all_game_list = ['bank_heist', 'battle_zone', 'beam_rider', 'berzerk-n', 'bowling', 'boxing', 'breakout', 'carnival-n']
    # all_game_list = ['centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk']
    # all_game_list = ['elevator_action-n', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar']
    # all_game_list = ['hero', 'ice_hockey', 'jamesbond', 'journey_escape-n', 'kangaroo', 'krull', 'kung_fu_master']
    # all_game_list = ['montezuma_revenge-n', 'ms_pacman', 'name_this_game', 'phoenix-n', 'pitfall-n', 'pong', 'pooyan-n']
    # all_game_list = ['private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing-n']
    # all_game_list = ['solaris-n', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down']
    # all_game_list = ['venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge-n', 'zaxxon']
    # all_game_list = ['pong', 'assault', 'ms_pacman']
    all_game_list = ['assault']
    for game in all_game_list:
        if '-n' in game:
            '''games that are not in the nature DQN list'''
            continue
        import atari_py
        game_path = atari_py.get_game_path(game)
        game_path = str.encode(game_path)
        env = ALEInterface()
        env.setFloat('repeat_action_probability'.encode('utf-8'), 0.0)
        env.setInt(b'random_seed', 3)
        env.loadROM(game_path)
        env.reset_game()
        if test in ['restoreState']:
            state_after_reset = env.cloneState()
        if test in ['restoreSystemState']:
            state_after_reset = env.cloneSystemState()
        if test in ['setRAM']:
            ram_after_reset = env.getRAM()
            state_after_reset = env.cloneSystemState()
            ram_candidate = np.load(
                './stochasticity_ram_mask/{}.npy'.format(game),
            )
        print('=====================================================')
        try:
            action_sequence = np.load(
                './action_sequence/action_sequence_{}_{}.npy'.format(
                    sequence,
                    game,
                ))
            print('action_sequence loaded')
        except Exception as e:
            '''generate a sequence of actions'''
            action_sequence = np.random.randint(
                len(env.getMinimalActionSet()),
                size=sequence,
            )
            np.save(
                './action_sequence/action_sequence_{}_{}.npy'.format(
                    sequence,
                    game,
                ),
                action_sequence,
            )
            print('action_sequence generated')
        print('=====================================================')
        bunch_obs = []
        distribution = []
        episode_length = -1
        state_metrix = []
        ram_metrix = []
        for bunch_i in range(bunch):
            if test in ['loadROM']:
                env.setInt(b'random_seed', bunch_i)
                env.loadROM(game_path)
                env.reset_game()
            elif test in ['restoreState']:
                env.restoreState(state_after_reset)
            elif test in ['restoreSystemState']:
                env.restoreSystemState(state_after_reset)
            elif test in ['setRAM']:
                env.reset_game()
                env.restoreSystemState(state_after_reset)
                env.setRAM(ram_after_reset)
                env.setRAM(env.getRAM() * (1 - ram_candidate) +
                           ram_candidate * (bunch_i % 255))
            state_sequence = []
            ram_sequence = []
            has_terminated = False
            for sequence_i in range(sequence):
                for frame_skip_i in range(frame_skip):
                    if not has_terminated:
                        env.act(env.getMinimalActionSet()[action_sequence[sequence_i]])
                        if env.game_over():
                            episode_length = sequence_i
                            has_terminated = True
                    if has_terminated:
                        break
                try:
                    clear_print('[{}|{}|{}]'.format(bunch_i, sequence_i, episode_length))
                except Exception as e:
                    pass
                state_sequence += [env.getScreenRGB()]
                ram_sequence += [process_ram(env.getRAM())]
                if has_terminated:
                    break
            if sequence > 0:
                if episode_length < 0:
                    # raise Exception('Did not terminate')
                    print('# WARNING: Did not terminate')
            obs = env.getScreenRGB()
            state_metrix += [copy.deepcopy(state_sequence)]
            ram_metrix += [copy.deepcopy(ram_sequence)]
            if_has_identical_one = False
            for bunch_obs_i in range(len(bunch_obs)):
                max_value = np.max(np.abs(obs - bunch_obs[bunch_obs_i]))
                if max_value < 1:
                    if_has_identical_one = True
                    distribution[bunch_obs_i] += 1
                    break
            if if_has_identical_one is False:
                bunch_obs += [obs]
                distribution += [1]
        grouped_num = len(bunch_obs)
        result_str = '{}game:{} grouped_num:{} distribution:{} \n'.format(
            result_str,
            game,
            grouped_num,
            distribution,
        )
        try:
            game_list += [game]
        except Exception as e:
            game_list = [game]
        try:
            grouped_num_list += [grouped_num]
        except Exception as e:
            grouped_num_list = [grouped_num]
        max_lenth = 0
        for bunch_i in range(len(state_metrix)):
            if len(state_metrix[bunch_i]) > max_lenth:
                max_lenth = len(state_metrix[bunch_i])
        for bunch_i in range(len(state_metrix)):
            state_metrix[bunch_i] += ([
                np.zeros(shape=state_metrix[0][0].shape,
                         dtype=state_metrix[0][0].dtype)
            ] * (max_lenth - len(state_metrix[bunch_i])))
            ram_metrix[bunch_i] += ([
                np.zeros(shape=ram_metrix[0][0].shape,
                         dtype=ram_metrix[0][0].dtype)
            ] * (max_lenth - len(state_metrix[bunch_i])))
        state_list = []
        state_metrix_id = np.zeros((len(state_metrix), len(state_metrix[0])), dtype=int)
        for bunch_i in range(len(state_metrix)):
            for sequence_i in range(len(state_metrix[0])):
                found_in_state_list = False
                for state_list_id in range(len(state_list)):
                    if np.max(state_list[state_list_id] -
                              state_metrix[bunch_i][sequence_i]) < 1:
                        state_metrix_id[bunch_i][sequence_i] = state_list_id
                        found_in_state_list = True
                        break
                if not found_in_state_list:
                    state_list += [np.copy(state_metrix[bunch_i][sequence_i])]
                    state_metrix_id[bunch_i][sequence_i] = (len(state_list) - 1)
        state_metrix_id_unsorted = np.copy(state_metrix_id)
        state_metrix_id = state_metrix_id.tolist()
        state_metrix_id.sort(key=lambda row: row[:], reverse=True)
        state_metrix_id = np.array(state_metrix_id)
        fig, ax = plt.subplots()
        im = ax.imshow(state_metrix_id)
        plt.show()
        plt.savefig(
            './results/{}_state_metrix_id.jpg'.format(game),
            dpi=600,
        )
        state_metrix_figure = np.zeros(
            ((10 + state_metrix[0][0].shape[0]) * len(state_metrix),
             state_metrix[0][0].shape[1] * len(state_metrix[0]),
             state_metrix[0][0].shape[2]),
            dtype=state_metrix[0][0].dtype)
        ram_metrix_figure = np.zeros(
            ((5 + ram_metrix[0][0].shape[0]) * len(state_metrix),
             ram_metrix[0][0].shape[1] * len(state_metrix[0]),
             ram_metrix[0][0].shape[2]),
            dtype=ram_metrix[0][0].dtype)
        ram_candidate = list(range(env.getRAMSize()))
        for bunch_i in range(len(state_metrix)):
            ram_metrix_figure[((bunch_i) * (5 + ram_metrix[0][0].shape[0])):(
                5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])), :, 2] = 255
        for bunch_i in range(len(state_metrix)):
            for sequence_i in range(len(state_metrix[0])):
                state_metrix_figure[
                    (10 + (bunch_i) * (10 + state_metrix[0][0].shape[0])):(bunch_i + 1) *
                    (10 + state_metrix[0][0].shape[0]),
                    (sequence_i) * state_metrix[0][0].shape[1]:(sequence_i + 1) *
                    state_metrix[0][0].shape[1]] = state_list[
                        state_metrix_id[bunch_i][sequence_i]]
                for bunch_ii in range(state_metrix_id.shape[0]):
                    if np.max(state_metrix_id_unsorted[bunch_ii] -
                              state_metrix_id[bunch_i]) < 1:
                        at_unsorted_bunch = bunch_ii
                        break
                ram_metrix_figure[(
                    5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])):(bunch_i + 1) *
                    (5 + ram_metrix[0][0].shape[0]),
                    (sequence_i) * ram_metrix[0][0].shape[1]:(sequence_i + 1) *
                    ram_metrix[0][0].shape[1]] = ram_metrix[at_unsorted_bunch][sequence_i]
        for bunch_i in range(len(state_metrix)):
            for sequence_i in range(len(state_metrix[0])):
                if bunch_i > 0:
                    if state_metrix_id[bunch_i][sequence_i] != state_metrix_id[
                            bunch_i - 1][sequence_i]:
                        # draw a line to separate the bunches
                        previous = ram_metrix_figure[(
                            5 + (bunch_i - 1) * (5 + ram_metrix[0][0].shape[0])):(
                                (bunch_i) * (5 + ram_metrix[0][0].shape[0])),
                            sequence_i, 0]
                        later = ram_metrix_figure[(
                            5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])):(
                                (bunch_i + 1) * (5 + ram_metrix[0][0].shape[0])),
                            sequence_i, 0]
                        delta = np.abs(previous - later)
                        state_metrix_figure[(
                            (bunch_i) * (10 + state_metrix[0][0].shape[0])):(
                                10 + (bunch_i) * (10 + state_metrix[0][0].shape[0])),
                            (sequence_i) * state_metrix[0][0].shape[1]:, 0] = 255
                        ram_metrix_figure[((bunch_i) * (5 + ram_metrix[0][0].shape[0])
                                           ):(5 + (bunch_i) *
                                              (5 + ram_metrix[0][0].shape[0])),
                                          (sequence_i) * ram_metrix[0][0].shape[1]:,
                                          0] = 255
                        ram_metrix_figure[((bunch_i) * (5 + ram_metrix[0][0].shape[0])
                                           ):(5 + (bunch_i) *
                                              (5 + ram_metrix[0][0].shape[0])),
                                          (sequence_i) * ram_metrix[0][0].shape[1]:,
                                          1:] = 0
        from PIL import Image
        Image.fromarray(state_metrix_figure).save(
            "./results/{}_state_metrix_figure.jpeg".format(game))
        Image.fromarray(ram_metrix_figure.astype(state_metrix_figure.dtype)).save(
            "./results/{}_ram_metrix_figure.jpeg".format(game))
    print(result_str)
    print('===============')
    for game_i in range(len(game_list)):
        print(game_list[game_i])
    for grouped_num_i in range(len(grouped_num_list)):
        print(grouped_num_list[grouped_num_i])
# compare successive RAM snapshots and report which byte indices changed
st = np.ndarray(rams_0[0].shape, dtype=bool)
st[:] = True
temp = rams_0[0]
for ram in rams_0:
    st = np.equal(st, (np.equal(temp, ram)))
    temp = ram
diff = np.equal(mv, st)
print(diff)
for idx in range(0, diff.shape[0]):
    if not diff[idx]:
        print(idx)
"""
ale.reset_game()
ram = ale.getRAM()
ram[16] = 99
ram[10] = 66
ale.alterEmulatorRAM(ram)
ale.act(1)
plt.imshow(ale.getScreenRGB())
plt.show()
for i in range(800):
    ale.act(2)
plt.imshow(ale.getScreenRGB())
plt.show()
for i in range(20):
    plt.imshow(ale.getScreenRGB())
    plt.show()
    ale.act(4)
print(ale.getRAM())
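# Hedged sketch (assumption, not from the original source): the snippet above assumes that
# `rams_0` (a list of RAM snapshots) and `mv` (a boolean mask of bytes expected to change)
# already exist. One plausible way such snapshots could be collected:
rams_0 = []
for _ in range(10):
    ale.act(0)                                 # NOOP for a few frames
    rams_0.append(ale.getRAM().copy())         # keep an independent copy of each snapshot
mv = np.ones(rams_0[0].shape, dtype=bool)      # placeholder mask; replace with the real one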
reward = ale.act(a)
total_reward += reward

# clear screen
screen.fill((0, 0, 0))

# get atari screen pixels and blit them
numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.int32)
ale.getScreenRGB(numpy_surface)
del numpy_surface
screen.blit(pygame.transform.scale2x(game_surface), (0, 0))

# get RAM
ram_size = ale.getRAMSize()
ram = np.zeros((ram_size), dtype=np.uint8)
ale.getRAM(ram)

# Display ram bytes
font = pygame.font.SysFont("Ubuntu Mono", 32)
text = font.render("RAM: ", 1, (255, 208, 208))
screen.blit(text, (330, 10))
font = pygame.font.SysFont("Ubuntu Mono", 25)
height = font.get_height() * 1.2
line_pos = 40
ram_pos = 0
while ram_pos < 128:
    ram_string = ''.join(
        ["%02X " % ram[x] for x in range(ram_pos, min(ram_pos + 16, 128))])
    text = font.render(ram_string, 1, (255, 255, 255))
    screen.blit(text, (340, line_pos))
    line_pos += height
    ram_pos += 16
def main():
    # Identify the RAM bytes responsible for stochasticity: whenever a branch rollout's
    # screen diverges from the base rollout, intersect the set of RAM bytes that differed
    # one step earlier; the surviving byte is saved as the game's stochasticity mask.
    result = {
        'name': [],
        'grouped_num': [],
        'distribution': [],
    }
    result_str = ''
    # all_game_list = ['air_raid-n', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis']
    # all_game_list = ['bank_heist', 'battle_zone', 'beam_rider', 'berzerk-n', 'bowling', 'boxing', 'breakout', 'carnival-n']
    # all_game_list = ['centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk']
    # all_game_list = ['elevator_action-n', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar']
    # all_game_list = ['hero', 'ice_hockey', 'jamesbond', 'journey_escape-n', 'kangaroo', 'krull', 'kung_fu_master']
    # all_game_list = ['montezuma_revenge-n', 'ms_pacman', 'name_this_game', 'phoenix-n', 'pitfall-n', 'pong', 'pooyan-n']
    # all_game_list = ['private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing-n']
    # all_game_list = ['solaris-n', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down']
    # all_game_list = ['venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge-n', 'zaxxon']
    all_game_list = ['assault']
    for game in all_game_list:
        if '-n' in game:
            '''games that are not in the nature DQN list'''
            continue
        import atari_py
        game_path = atari_py.get_game_path(game)
        game_path = str.encode(game_path)
        env = ALEInterface()
        env.setFloat('repeat_action_probability'.encode('utf-8'), 0.0)
        env.setInt(b'random_seed', 3)
        env.loadROM(game_path)
        env.reset_game()
        print('=====================================================')
        try:
            action_sequence = np.load(
                './action_sequence/action_sequence_{}_{}.npy'.format(
                    sequence,
                    game,
                ))
            print('action_sequence loaded')
        except Exception as e:
            '''generate a sequence of actions'''
            action_sequence = np.random.randint(
                len(env.getMinimalActionSet()),
                size=sequence,
            )
            np.save(
                './action_sequence/action_sequence_{}_{}.npy'.format(
                    sequence,
                    game,
                ),
                action_sequence,
            )
            print('action_sequence generated')
        print('=====================================================')
        state_sequence_base = []
        ram_sequence_base = []
        has_terminated = False
        for sequence_i in range(sequence):
            state_sequence_base += [env.getScreenRGB()]
            ram_sequence_base += [env.getRAM()]
            for frame_skip_i in range(frame_skip):
                if not has_terminated:
                    env.act(env.getMinimalActionSet()[action_sequence[sequence_i]])
                    if env.game_over():
                        episode_length = sequence_i
                        has_terminated = True
                if has_terminated:
                    break
            if has_terminated:
                break
        if has_terminated in [False]:
            raise Exception('sequence length is not enough')
        ram_candidate = np.ones((env.getRAMSize()), dtype=np.uint8)
        state_sequence_branch = []
        ram_sequence_branch = []
        for bunch_i in range(bunch):
            env.setInt(b'random_seed', bunch_i)
            env.loadROM(game_path)
            env.reset_game()
            has_terminated = False
            for sequence_i in range(sequence):
                state_sequence_branch += [env.getScreenRGB()]
                ram_sequence_branch += [env.getRAM()]
                if sequence_i > 0:
                    max_value = np.max(
                        np.abs(env.getScreenRGB() - state_sequence_base[sequence_i]))
                    if max_value > 0:
                        delta_ram = np.sign(
                            np.abs(ram_sequence_branch[sequence_i - 1] -
                                   ram_sequence_base[sequence_i - 1]))
                        ram_candidate *= delta_ram
                        remain = np.sum(ram_candidate)
                        print('remain {} bytes'.format(remain))
                        if remain <= 1:
                            if remain == 1:
                                print(ram_candidate)
                                np.save(
                                    './stochasticity_ram_mask/{}.npy'.format(game),
                                    ram_candidate,
                                )
                                raise Exception('done')
                            else:
                                raise Exception('error')
                        has_terminated = True
                if has_terminated:
                    break
                for frame_skip_i in range(frame_skip):
                    if not has_terminated:
                        env.act(env.getMinimalActionSet()[action_sequence[sequence_i]])
                        if env.game_over():
                            has_terminated = True
                    if has_terminated:
                        break
                if has_terminated:
                    break
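# Hedged sketch (assumption, not from the original source): both main() variants above rely
# on module-level names defined elsewhere in the real scripts. Plausible definitions, for
# illustration only:
import numpy as np

test = 'loadROM'     # reset strategy to probe: 'loadROM', 'restoreState', 'restoreSystemState', or 'setRAM'
sequence = 100       # number of (frame-skipped) action steps per rollout
bunch = 10           # number of rollouts to compare
frame_skip = 4       # emulator frames per action


def clear_print(msg):
    """Overwrite the current console line with a progress message."""
    print('\r' + msg, end='', flush=True)


def process_ram(ram):
    """Reshape the 128-byte RAM vector into a small 3-channel block for plotting."""
    return np.tile(ram.reshape(8, 16, 1), (1, 1, 3))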
class KungFuMaster(object):
    def __init__(self,
                 rom='/home/josema/AI/ALE/Arcade-Learning-Environment/Roms/kung_fu_master.bin',
                 trainsessionname='test'):
        self.agent = None
        self.isAuto = True
        self.gui_visible = False
        self.userquit = False
        self.optimalPolicyUser = False  # optimal policy set by user
        self.trainsessionname = trainsessionname
        self.elapsedtime = 0  # elapsed time for this experiment
        self.keys = 0
        # Configuration
        self.pause = False  # game is paused
        self.debug = False
        self.sleeptime = 0.0
        self.command = 0
        self.iteration = 0
        self.cumreward = 0
        self.cumreward100 = 0  # cum reward for statistics
        self.cumscore100 = 0
        self.ngoalreached = 0
        self.max_level = 1
        self.hiscore = 0
        self.hireward = -1000000
        self.resfile = open("data/" + self.trainsessionname + ".dat", "a+")
        self.legal_actions = 0
        self.rom = rom
        self.key_status = []

    def init(self, agent):  # init after creation (uses args set from cli)
        self.ale = ALEInterface()
        self.ale.setInt('random_seed', 123)
        ram_size = self.ale.getRAMSize()
        self.ram = np.zeros((ram_size), dtype=np.uint8)
        if (self.gui_visible):
            os.environ['SDL_VIDEO_CENTERED'] = '1'
        if sys.platform == 'darwin':
            pygame.init()
            self.ale.setBool('sound', False)  # Sound doesn't work on OSX
        elif sys.platform.startswith('linux'):
            pygame.init()
            self.ale.setBool('sound', True)
        self.ale.setBool('display_screen', False)
        self.ale.loadROM(self.rom)
        self.legal_actions = self.ale.getLegalActionSet()
        if (self.gui_visible):
            (self.screen_width, self.screen_height) = self.ale.getScreenDims()
            print("width/height: " + str(self.screen_width) + "/" + str(self.screen_height))
            (display_width, display_height) = (1024, 420)
            self.screen = pygame.display.set_mode((display_width, display_height))
            pygame.display.set_caption("Reinforcement Learning - Sapienza - Jose M Salas")
            self.numpy_surface = np.zeros((self.screen_height, self.screen_width, 3),
                                          dtype=np.uint8)
            self.game_surface = pygame.Surface((self.screen_width, self.screen_height))
            pygame.display.flip()
            # init clock
            self.clock = pygame.time.Clock()
        self.agent = agent
        self.nactions = len(self.legal_actions)  # 0: not moving, 1: left, 2: right, 3: up, 4: down
        for i in range(self.nactions):
            self.key_status.append(False)
        print(self.nactions)
        # ns = 89999  # Number of statuses if we use enemy type ram info without level number
        # FINAL ns = 489999  # Number of statuses if we use enemy type ram info
        ns = 4899999  # Number of statuses if we use enemy type ram info
        # ns = 48999
        print('Number of states: %d' % ns)
        self.agent.init(ns, self.nactions)  # 1 for RA not used here

    def initScreen(self):
        if (self.gui_visible):
            if sys.platform == 'darwin':
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                pygame.init()
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', False)
        if (self.gui_visible):
            (self.screen_width, self.screen_height) = self.ale.getScreenDims()
            print("width/height: " + str(self.screen_width) + "/" + str(self.screen_height))
            (display_width, display_height) = (1024, 420)
            self.screen = pygame.display.set_mode((display_width, display_height))
            pygame.display.set_caption("Reinforcement Learning - Sapienza - Jose M Salas")
            self.numpy_surface = np.zeros((self.screen_height, self.screen_width, 3),
                                          dtype=np.uint8)
            self.game_surface = pygame.Surface((self.screen_width, self.screen_height))
            pygame.display.flip()
            # init clock
            self.clock = pygame.time.Clock()

    def reset(self):
        self.pos_x = 0
        self.pos_y = 0
        # Kung fu master observations
        self.enemy_pos = 0
        self.n_enemies = 0
        self.my_pos = 0
        self.danger_pos = 0
        self.danger_type = 0
        self.enemy_type = 0  # 0, 1, 2, 3, 80, 81, 82, 40
        self.blocked = 0
        self.prev_blocked = 0
        self.hold_hit = 0
        self.time_left1 = 0
        self.time_left2 = 0
        self.my_energy = 39
        self.previous_my_energy = 39
        self.lifes = 3
        self.previous_lifes = 3
        self.got_hit = 0
        self.got_blocked = 0
        self.got_unblocked = 0
        self.still_blocked = False
        self.starting_pos = 0
        self.level = 1
        self.score = 0
        self.cumreward = 0
        self.cumscore = 0
        self.action_reward = 0
        self.current_reward = 0  # accumulate reward over all events happened during this action until next different state
        self.prev_state = None  # previous state
        self.firstAction = True  # first action of the episode
        self.finished = False  # episode finished
        self.newstate = True  # new state reached
        self.numactions = 0  # number of actions in this episode
        self.iteration += 1
        self.agent.optimal = self.optimalPolicyUser or (
            self.iteration % 100) == 0  # False #(random.random() < 0.5) # choose greedy action selection for the entire episode

    def pair_function(self):
        # Combine the number of enemies, player blocked and danger type information into 7 different states
        if self.n_enemies > 0:
            self.danger_type = 0
        # print (str(self.n_enemies) + " - " + str(self.danger_type) + ' - ' + str(self.blocked))
        pair = (int)((0.5 * (self.n_enemies + self.danger_type) *
                      (self.n_enemies + self.danger_type + 1) + self.danger_type + 1) *
                     (1 - (self.blocked / 128)))
        if pair > 8:
            return 5  # game not started yet
        else:
            return pair

    def enemy_type_s(self):
        if self.enemy_type > 127:
            return (self.enemy_type - 128 + 4)
        elif self.enemy_type == 64:
            return 8
        else:
            return self.enemy_type

    def getstate(self):
        # print ('enemy type: ' + str(self.enemy_type_s()) + 'level: ' + str(self.level -1) )
        x = (int)((self.level - 1) * 1000000 + self.pair_function() * 100000 +
                  (self.enemy_type_s() * 10000) + np.rint(self.my_pos / 32) * 1000 +
                  np.rint(self.enemy_pos / 32) * 100 + np.rint(self.danger_pos / 32) * 10 +
                  np.rint(self.hold_hit / 16))
        # 3FINAL x = (int)((self.enemy_type_s()*1000) + (self.level-1)*100000 + self.pair_function()*10000 + np.rint(self.enemy_pos/32)*100 + np.rint(self.danger_pos/32)*10 + np.rint(self.hold_hit/16))
        # 2NO LEVEL x = (int)((self.enemy_type_s()*1000) + self.pair_function()*10000 + np.rint(self.enemy_pos/32)*100 + np.rint(self.danger_pos/32)*10 + np.rint(self.hold_hit/16))
        # 1NO ENEMY TYPE x = (int)((self.level-1)*10000 + self.pair_function()*1000 + np.rint(self.enemy_pos/32)*100 + np.rint(self.danger_pos/32)*10 + np.rint(self.hold_hit/16))
        return x

    def goal_reached(self):
        # return (self.my_energy>0 and self.time_left1==0 and self.time_left2<5)  # and self.my_energy==39)
        return (self.level == 5)

    def update(self, a):
        self.command = a
        # Update RAM
        self.ale.getRAM(self.ram)
        # Get info from RAM
        self.enemy_pos = self.ram[72]
        self.n_enemies = self.ram[91]
        self.danger_pos = self.ram[73]
        self.my_pos = self.ram[74]
        self.hold_hit = self.ram[77]
        self.enemy_type = self.ram[54]
        if self.level < self.ram[31]:
            self.starting_pos = self.ram[74]
        self.level = self.ram[31]
        self.max_level = max(self.level, self.max_level)
        # Danger/Enemy position:
        #   49 = no danger
        #   50 = danger approaching from left
        #   208 = danger approaching from right
        #   ram[96] = 6, danger comes from top
        #   ram[96] = 29, danger comes from bottom
        #   ram[96] = 188, none
        if self.ram[96] == 6:
            self.danger_type = 0
        elif self.ram[96] == 29:
            self.danger_type = 1
        else:
            self.danger_type = 2
        self.time_left1 = self.ram[27]
        self.time_left2 = self.ram[28]
        self.previous_my_energy = self.my_energy
        self.my_energy = self.ram[75]
        if self.my_energy < self.previous_my_energy and not self.still_blocked and self.ram[34] == 0:
            self.got_hit = STATES['GotHit']
        else:
            self.got_hit = 0
        self.previous_lifes = self.lifes
        self.lifes = self.ram[29]
        self.prev_blocked = self.blocked
        self.blocked = self.ram[61]
        if self.blocked > self.prev_blocked and not self.still_blocked:
            self.got_blocked = STATES['GotBlocked']
            self.still_blocked = True
            self.got_unblocked = 0
        elif self.blocked < self.prev_blocked and self.still_blocked:
            self.got_unblocked = STATES['GotUnblocked']
            self.still_blocked = False
            self.got_blocked = 0
        else:
            self.got_blocked = 0
            self.got_unblocked = 0
        # print ('enemy_pos=' +str(self.enemy_pos) + ' - danger_pos=' + str(self.danger_pos) + ' - my_position='
        #        + str(self.my_pos) + ' - my_energy=' + str(self.my_energy) + ' - blocked=' + str(self.blocked) + ' - danger_type=' + str(self.danger_type))
        self.prev_state = self.getstate()  # remember previous state
        # print " == Update start ",self.prev_state," action",self.command
        self.current_reward = 0  # accumulate reward over all events happened during this action until next different state
        # print('self.current_reward = 0')
        self.numactions += 1  # total number of actions executed in this episode
        # while (self.prev_state == self.getstate()):
        if (self.firstAction):
            self.starting_pos = self.ram[74]
            self.firstAction = False
            self.current_reward = self.ale.act(a)
        else:
            self.current_reward = self.ale.act(a)
            if self.ram[34] == 0:  # only when playing
                if (a == 3 and self.starting_pos < self.my_pos) or (
                        a == 4 and self.starting_pos > self.my_pos):
                    self.action_reward = STATES['MoveFW']
                elif (a == 3 and self.starting_pos > self.my_pos) or (
                        a == 4 and self.starting_pos < self.my_pos):
                    self.action_reward = STATES['MoveBW']
                else:
                    self.action_reward = STATES['NotMoving']
        self.score += self.current_reward
        self.current_reward += self.action_reward
        # print('score= ' + str(self.score) + ' current reward=' +str(np.rint(self.current_reward))+ ' - energy=' + str(self.my_energy/39.0) +
        #       ' - got_hot='+ str(self.got_hit) + ' - got_blocked=' + str(self.got_blocked) + ' - got_unblocked=' + str(self.got_unblocked))
        # check if episode terminated
        # self.draw_screen
        if self.goal_reached():
            self.current_reward += STATES['Alive']
            self.ngoalreached += 1
            # self.ale.reset_game()
            self.finished = True
        if (self.ale.game_over()):
            self.current_reward += STATES['Dead']
            if self.level > 1:
                print('game over in level ' + str(self.level))
            if self.my_energy > 0 and self.lifes == 3:
                print('Game over alive????')
            self.ale.reset_game()
            self.finished = True
        if self.level > 2:
            if self.gui_visible == False:
                self.gui_visible = True
                self.initScreen()
        # print " ** Update end ",self.getstate(), " prev ",self.prev_state

    def input(self):
        self.isPressed = False
        if self.gui_visible:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    return False
                if event.type == pygame.KEYDOWN:
                    if event.key == pygame.K_SPACE:
                        self.pause = not self.pause
                        print("Game paused: ", self.pause)
                    elif event.key == pygame.K_a:
                        self.isAuto = not self.isAuto
                        self.sleeptime = int(self.isAuto) * 0.07
                    elif event.key == pygame.K_s:
                        self.sleeptime = 1.0
                        self.agent.debug = False
                    elif event.key == pygame.K_d:
                        self.sleeptime = 0.07
                        self.agent.debug = False
                    elif event.key == pygame.K_f:
                        self.sleeptime = 0.005
                        self.agent.debug = False
                    elif event.key == pygame.K_g:
                        self.sleeptime = 0.0
                        self.agent.debug = False
                    elif event.key == pygame.K_o:
                        self.optimalPolicyUser = not self.optimalPolicyUser
                        print("Best policy: ", self.optimalPolicyUser)
                    elif event.key == pygame.K_q:
                        self.userquit = True
                        print("User quit !!!")
                    else:
                        pressed = pygame.key.get_pressed()
                        self.keys = 0
                        self.keys |= pressed[pygame.K_UP]
                        self.keys |= pressed[pygame.K_DOWN] << 1
                        self.keys |= pressed[pygame.K_LEFT] << 2
                        self.keys |= pressed[pygame.K_RIGHT] << 3
                        self.keys |= pressed[pygame.K_z] << 4
                        self.command = key_action_tform_table[self.keys]
                        self.key_status[self.command] = True
                if event.type == pygame.KEYUP:
                    pressed = pygame.key.get_pressed()
                    self.keys = 0
                    self.keys |= pressed[pygame.K_UP]
                    self.keys |= pressed[pygame.K_DOWN] << 1
                    self.keys |= pressed[pygame.K_LEFT] << 2
                    self.keys |= pressed[pygame.K_RIGHT] << 3
                    self.keys |= pressed[pygame.K_z] << 4
                    self.command = key_action_tform_table[self.keys]
                    self.key_status[self.command] = False
                    if not (True in self.key_status):
                        self.command = 0
        return True

    def getUserAction(self):
        return self.command

    def getreward(self):
        r = np.rint(self.current_reward) + self.got_hit + self.got_blocked + \
            self.got_unblocked - np.rint(self.blocked / 128)
        self.cumreward += r
        return r

    def print_report(self, printall=False):
        toprint = printall
        ch = ' '
        if (self.agent.optimal):
            ch = '*'
            toprint = True
        s = 'Iter %6d, sc: %3d, l: %d, na: %4d, r: %5d %c' % (
            self.iteration, self.score, self.level, self.numactions, self.cumreward, ch)
        if self.score > self.hiscore:
            self.hiscore = self.score
            s += ' HISCORE '
            toprint = True
        if self.cumreward > self.hireward:
            self.hireward = self.cumreward
            s += ' HIREWARD '
            toprint = True
        if (toprint):
            print(s)
        self.cumreward100 += self.cumreward
        self.cumscore100 += self.score
        numiter = 100
        if (self.iteration % numiter == 0):
            # self.doSave()
            pgoal = float(self.ngoalreached * 100) / numiter
            print('----------------------------------------------------------------------------------------------------------------------')
            print("%s %6d avg last 100: reward %d | score %.2f | level %d | p goals %.1f %%" %
                  (self.trainsessionname, self.iteration, self.cumreward100 / 100,
                   float(self.cumscore100) / 100, self.max_level, pgoal))
            print('----------------------------------------------------------------------------------------------------------------------')
            self.cumreward100 = 0
            self.cumscore100 = 0
            self.ngoalreached = 0
        sys.stdout.flush()
        self.resfile.write("%d,%d,%d,%d\n" % (self.score, self.cumreward,
                                              self.goal_reached(), self.numactions))
        self.resfile.flush()

    def draw(self):
        if self.gui_visible:
            self.screen.fill((0, 0, 0))
            self.ale.getScreenRGB(self.numpy_surface)
            pygame.surfarray.blit_array(self.game_surface,
                                        np.transpose(self.numpy_surface, (1, 0, 2)))
            # pygame.pixelcopy.array_to_surface(self.game_surface, np.transpose(self.numpy_surface,(1,0,2)))
            self.screen.blit(
                pygame.transform.scale2x(
                    pygame.transform.scale(self.game_surface,
                                           (self.screen_height, self.screen_height))),
                (0, 0))
            # Display ram bytes
            font = pygame.font.SysFont("Ubuntu Mono", 32)
            text = font.render("RAM: ", 1, (255, 208, 208))
            self.screen.blit(text, (430, 10))
            font = pygame.font.SysFont("Ubuntu Mono", 25)
            height = font.get_height() * 1.2
            line_pos = 40
            ram_pos = 0
            while (ram_pos < 128):
                ram_string = ''.join([
                    "%02X " % self.ram[x]
                    for x in range(ram_pos, min(ram_pos + 16, 128))
                ])
                text = font.render(ram_string, 1, (255, 255, 255))
                self.screen.blit(text, (440, line_pos))
                line_pos += height
                ram_pos += 16
            # display current action
            font = pygame.font.SysFont("Ubuntu Mono", 32)
            text = font.render("Current Action: " + str(self.command), 1, (208, 208, 255))
            height = font.get_height() * 1.2
            self.screen.blit(text, (430, line_pos))
            line_pos += height
            # display reward
            font = pygame.font.SysFont("Ubuntu Mono", 30)
            text = font.render("Total Reward: " + str(self.cumreward), 1, (208, 255, 255))
            self.screen.blit(text, (430, line_pos))
            pygame.display.flip()
            # clock.tick(60.)
        else:
            return 0

    def quit(self):
        self.resfile.close()
        pygame.quit()
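# Hedged usage sketch (not from the original source): wiring KungFuMaster to an agent.
# `SarsaAgent` and its decision() method are placeholders for whatever agent class the
# project provides (anything exposing init(nstates, nactions), an `optimal` flag, and a
# way to pick an action would do); the module-level STATES and key_action_tform_table
# tables used by the class are assumed to be defined elsewhere.
if __name__ == "__main__":
    agent = SarsaAgent()                      # hypothetical agent implementation
    game = KungFuMaster(trainsessionname='demo')
    game.init(agent)
    for episode in range(10):
        game.reset()
        game.ale.reset_game()
        while not game.finished:
            if not game.input():              # handle GUI events / user quit
                break
            a = agent.decision(game.getstate()) if game.isAuto else game.getUserAction()
            game.update(a)
            r = game.getreward()
            game.draw()
        game.print_report()
    game.quit()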
        [152, 41], [175, 41], [180, 29], [203, 29], [231, 16], [231, 41],
        [175, 65], [180, 53], [203, 53], [147, 77], [120, 93], [152, 65],
        [231, 65], [175, 93], [97, 93], [180, 77], [231, 93], [180, 105],
        [147, 105], [203, 77], [175, 77], [175, 117], [231, 117], [203, 129],
        [203, 105], [180, 129], [231, 141], [152, 117], [124, 77], [124, 105],
        [152, 93]]

learning = []

# Limits action set to UP RIGHT LEFT DOWN actions of ALE environment
actions = range(2, 6)

# Starts the learning episodes
for episode in range(episodes):
    total_reward = 0
    sup_reward = 0
    action = 0
    rewards = []
    ram = ale.getRAM()
    Q = 0
    last_action = 0
    last_Q = 0
    last_features = NUM_FEATURES * [rd.random()]
    BLUE_PIX = [45, 87, 176]
    YEL_PIX = [210, 210, 64]

    # Starts iterations of episode
    for time in range(max_time):
        # Get bert pos in RAM
        B_POS = [ram[33], ram[43]]
        # Get number of lives remaining
        LIVES = ale.lives()
        # last_ram = ram
        ram = ale.getRAM()
        screen = ale.getScreenRGB()
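# Hedged sketch (assumption, not from the original source): the variables set up above
# (Q, last_Q, last_action, last_features) suggest a SARSA-style update with linear function
# approximation over NUM_FEATURES features. A generic step of that kind, with hypothetical
# names `theta` (weight vector), `alpha`, and `gamma`, might look like:
def sarsa_update(theta, last_features, last_Q, reward, Q, alpha=0.01, gamma=0.99):
    """Move the weights toward the one-step SARSA target for the previous state-action pair."""
    td_error = reward + gamma * Q - last_Q
    return [w + alpha * td_error * f for w, f in zip(theta, last_features)]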