# Assumed imports for this snippet; load_params, process_args, and Q_Learning
# come from the enclosing training package.
import os

import numpy as np
from ale_python_interface import ALEInterface


def launch(args, defaults, description):
    """Execute a complete training run."""
    rec_screen = ""
    if "--nn-file" in args:
        temp_params = vars(load_params(args[args.index("--nn-file") + 1]))
        for p in temp_params:
            try:
                vars(defaults)[p.upper()] = temp_params[p]
            except Exception:
                print("warning: parameter", p, "from param file doesn't exist.")
        # rec_screen = args[args.index("--nn-file")+1][:-len("last_model.pkl")]+"/frames"
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    parameters.rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    rng = np.random.RandomState(parameters.seed)
    folder_name = None if parameters.folder_name == "" else parameters.folder_name

    ale = ALEInterface()
    ale.setInt('random_seed', rng.randint(parameters.seed))
    ale.setBool('display_screen', parameters.display_screen)
    ale.setString('record_screen_dir', rec_screen)

    trainer = Q_Learning(model_params=parameters, ale_env=ale,
                         folder_name=folder_name)
    trainer.train()
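# A hedged usage sketch (not part of the original file): launch() is normally
# called from a per-experiment run script that forwards the CLI arguments, a
# defaults object (BASE_ROM_PATH, ROM, SEED, ...), and the script docstring as
# the description. `Defaults` below is an assumed name for that object.
if __name__ == '__main__':
    import sys
    launch(sys.argv[1:], Defaults, __doc__)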
# Assumed imports for this snippet; ACTION_MEANING is a module-level dict in
# the original project.
import os
from collections import deque

import cv2
import numpy as np
from ale_python_interface import ALEInterface


class AtariWrapper():
    """ALE wrapper that tries to mimic the options in the DQN paper,
    including the preprocessing (except resizing/cropping).
    """
    action_words = [
        'NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN',
        'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT'
    ]
    _action_set = [0, 2, 3, 4, 5, 6, 7, 8, 9]  # Valid actions for ALE.

    # Possible actions are just a list from 0..num_valid_actions;
    # we still need to map from the latter to the former when calling ALE.
    possible_actions = list(range(len(_action_set)))

    def __init__(self, rom_path, seed=123, frameskip=4, show_display=False,
                 stack_num_states=4, concatenate_state_every=4):
        """
        Parameters:
            frameskip: either a tuple (a random range to choose from, with the
                top value excluded) or an int. Also known as action repeat.
            stack_num_states: number of dimensions/channels the state should have.
            concatenate_state_every: after how many frames one channel should be
                appended to the state. The number is in absolute frames,
                independent of frameskip.
        """
        self.stack_num_states = stack_num_states
        self.concatenate_state_every = concatenate_state_every

        self.game_path = rom_path
        if not os.path.exists(self.game_path):
            raise IOError('You asked for game %s but path %s does not exist'
                          % (os.path.basename(rom_path), self.game_path))
        self.frameskip = frameskip

        try:
            self.ale = ALEInterface()
        except Exception as e:
            print("ALEInterface could not be loaded. "
                  "ale_python_interface import failed")
            raise e

        # Set some default options
        self.ale.setInt(b'random_seed', seed)
        self.ale.setBool(b'sound', False)
        self.ale.setBool(b'display_screen', show_display)
        self.ale.setFloat(b'repeat_action_probability', 0.)

        # Load the rom
        self.ale.loadROM(self.game_path)

        (self.screen_width, self.screen_height) = self.ale.getScreenDims()
        self.latest_frame_fifo = deque(maxlen=2)  # Holds the two closest frames to max over.
        self.state_fifo = deque(maxlen=stack_num_states)

    def _step(self, a, force_noop=False):
        """Perform one step of the environment.

        Automatically repeats the step self.frameskip number of times.

        Parameters:
            force_noop: force a no-op, ignoring the action supplied.
        """
        assert a in self.possible_actions + [0]

        if force_noop:
            action, num_steps = 0, 1
        else:
            action = self._action_set[a]
            if isinstance(self.frameskip, int):
                num_steps = self.frameskip
            else:
                num_steps = np.random.randint(self.frameskip[0], self.frameskip[1])

        reward = 0.0
        for i in range(num_steps):
            reward += self.ale.act(action)
            cur_frame = self.observe_raw(get_rgb=True)
            cur_frame_cropped = self.crop_frame(cur_frame)
            self.latest_frame_fifo.append(cur_frame_cropped)

            if i % self.concatenate_state_every == 0:
                curmax_frame = np.amax(self.latest_frame_fifo, axis=0)
                frame_lumi = self.convert_to_gray(curmax_frame)
                self.state_fifo.append(frame_lumi)

        # Transpose so we get HxWxC instead of CxHxW
        self.current_frame = np.array(np.transpose(self.state_fifo, (1, 2, 0)))
        self.current_frame = cv2.resize(self.current_frame, (84, 84))
        return self.current_frame, reward, self.ale.game_over(), {
            "ale.lives": self.ale.lives()
        }

    def step(self, *args, **kwargs):
        """Perform one step of the environment."""
        lives_before = self.ale.lives()
        next_state, reward, done, info = self._step(*args, **kwargs)
        lives_after = self.ale.lives()

        # End the episode when a life is lost
        if lives_before > lives_after:
            done = True

        return next_state, reward, done, info

    def observe_raw(self, get_rgb=False):
        """Observe either RGB or grayscale frames.

        Initializing the arrays forces ALE not to write through stale pointers.
        """
        if get_rgb:
            cur_frame_rgb = np.zeros((self.screen_height, self.screen_width, 3),
                                     dtype=np.uint8)
            self.ale.getScreenRGB(cur_frame_rgb)
            return cur_frame_rgb
        else:
            cur_frame_gray = np.zeros((self.screen_height, self.screen_width),
                                      dtype=np.uint8)
            self.ale.getScreenGrayscale(cur_frame_gray)
            return cur_frame_gray

    def crop_frame(self, frame):
        """Simply crop a frame. Does nothing by default."""
        return frame

    def convert_to_gray(self, img):
        """Get the luminance channel."""
        img_f = np.float32(img)
        img_lumi = 0.299 * img_f[:, :, 0] + \
                   0.587 * img_f[:, :, 1] + \
                   0.114 * img_f[:, :, 2]
        return np.uint8(img_lumi)

    def reset(self):
        """Reset the game."""
        self.ale.reset_game()
        s = self.observe_raw(get_rgb=True)
        s = self.crop_frame(s)

        # Populate missing frames with blank ones.
        for _ in range(self.stack_num_states - 1):
            self.state_fifo.append(np.zeros(shape=(s.shape[0], s.shape[1])))

        self.latest_frame_fifo.append(s)  # Push the latest frame
        curmax_frame = s
        frame_lumi = self.convert_to_gray(s)
        self.state_fifo.append(frame_lumi)

        self.state = np.transpose(self.state_fifo, (1, 2, 0))
        self.state = cv2.resize(self.state, (84, 84))
        return self.state

    def get_action_meanings(self):
        """Return, in text, what the actions correspond to."""
        return [ACTION_MEANING[i] for i in self._action_set]

    def save_state(self):
        """Save the current state and return an identifier to the saved state."""
        return self.ale.cloneSystemState()

    def restore_state(self, ident):
        """Restore game state.

        Restores the saved state of the system and performs a no-op so a new
        frame can be generated in case a restore is followed by an observe().
        """
        self.ale.restoreSystemState(ident)
        self.step(0, force_noop=True)
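# A hedged usage sketch for the wrapper above (the ROM path and the random
# policy are placeholders, not part of the original code): reset the game, then
# step with indices into possible_actions until the episode ends.
env = AtariWrapper('/path/to/breakout.bin', seed=0, frameskip=4)
state = env.reset()
done, score = False, 0.0
while not done:
    a = np.random.choice(env.possible_actions)   # stand-in for a learned policy
    state, reward, done, info = env.step(a)
    score += reward
print('episode finished with score', score, 'and', info['ale.lives'], 'lives left')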
# Assumed imports for this snippet; `options` (the parsed command-line options),
# `AtariEnv`, and `ALEInterface` are module-level names in the original project.
import math
import os
import sys

import cv2
import gym
import numpy as np


class GameState(object):
    def __init__(self, rand_seed, options, display=False, no_op_max=30,
                 thread_index=-1):
        if options.use_gym:
            self._display = options.display
        else:
            self.ale = ALEInterface()
            self.ale.setInt(b'random_seed', rand_seed)
            self.ale.setFloat(b'repeat_action_probability',
                              options.repeat_action_probability)
            self.ale.setInt(b'frame_skip', options.frames_skip_in_ale)
            self.ale.setBool(b'color_averaging', options.color_averaging_in_ale)
        self._no_op_max = no_op_max

        self.options = options
        self.color_maximizing = options.color_maximizing_in_gs
        self.color_averaging = options.color_averaging_in_gs
        self.color_no_change = options.color_no_change_in_gs

        # for screen output in _process_frame()
        self.thread_index = thread_index
        self.record_gs_screen_dir = self.options.record_gs_screen_dir
        self.episode_record_dir = None
        self.episode = 1

        self.rooms = np.zeros((24), dtype=np.int)
        self.prev_room_no = 1
        self.room_no = 1
        self.new_room = -1

        if options.use_gym:
            # see https://github.com/openai/gym/issues/349
            def _seed(self, seed=None):
                self.ale.setFloat(b'repeat_action_probability',
                                  options.repeat_action_probability)
                from gym.utils import seeding
                self.np_random, seed1 = seeding.np_random(seed)
                # Derive a random seed. This gets passed as a uint, but gets
                # checked as an int elsewhere, so we need to keep it below
                # 2**31.
                seed2 = seeding.hash_seed(seed1 + 1) % 2 ** 31
                # Empirically, we need to seed before loading the ROM.
                self.ale.setInt(b'random_seed', seed2)
                self.ale.loadROM(self.game_path)
                return [seed1, seed2]

            AtariEnv._seed = _seed
            self.gym = gym.make(options.gym_env)
            self.ale = self.gym.ale
            print(self.gym.action_space)
        else:
            if display:
                self._setup_display()

            self.ale.loadROM(options.rom.encode('ascii'))

            # collect minimal action set
            self.real_actions = self.ale.getMinimalActionSet()
            print("real_actions=", self.real_actions)
            if (len(self.real_actions) != self.options.action_size):
                print("***********************************************************")
                print("* action_size != len(real_actions)")
                print("***********************************************************")
                sys.exit(1)

        # height=210, width=160
        self._screen = np.empty((210 * 160 * 1), dtype=np.uint8)
        if (not options.use_gym) and (self.color_maximizing
                                      or self.color_averaging
                                      or self.color_no_change):
            self._screen_RGB = np.empty((210 * 160 * 3), dtype=np.uint8)
            self._prev_screen_RGB = np.empty((210 * 160 * 3), dtype=np.uint8)
        self._have_prev_screen_RGB = False

        # for pseudo-count
        self.psc_use = options.psc_use
        if options.psc_use:
            psc_beta = options.psc_beta
            if options.psc_beta_list is not None:
                psc_beta = options.psc_beta_list[thread_index]
            psc_pow = options.psc_pow
            if options.psc_pow_list is not None:
                psc_pow = options.psc_pow_list[thread_index]
            print("[DIVERSITY]th={}:psc_beta={}, psc_pow={}".format(
                thread_index, psc_beta, psc_pow))
            self.psc_frsize = options.psc_frsize
            self.psc_k = options.psc_frsize ** 2
            self.psc_range_k = np.array([i for i in range(self.psc_k)])
            self.psc_rev_pow = 1.0 / psc_pow
            self.psc_alpha = math.pow(0.1, psc_pow)
            self.psc_beta = psc_beta
            self.psc_maxval = options.psc_maxval
            if options.psc_multi:
                self.psc_vcount = np.zeros(
                    (24, self.psc_maxval + 1, self.psc_k), dtype=np.float64)
                self.psc_n = np.zeros(24, dtype=np.float64)
            else:
                self.psc_vcount = np.zeros(
                    (self.psc_maxval + 1, self.psc_k), dtype=np.float64)
                self.psc_n = 0

        self.reset()

    # for pseudo-count
    def psc_set_psc_info(self, psc_info):
        if psc_info is not None:
            self.psc_vcount = np.array(psc_info["psc_vcount"], dtype=np.float64)
            if options.psc_multi:
                self.psc_n = np.array(psc_info["psc_n"], dtype=np.float64)
            else:
                self.psc_n = psc_info["psc_n"]

    def psc_set_gs_info(self, gs_info):
        self.psc_vcount = np.array(gs_info["psc_vcount"], dtype=np.float64)
        if options.psc_multi:
            self.psc_n = np.array(gs_info["psc_n"], dtype=np.float64)
        else:
            self.psc_n = gs_info["psc_n"]
        self.rooms = gs_info["rooms"]
        self.episode = gs_info["episode"]

    # for pseudo-count
    #@profile
    def psc_add_image(self, psc_image):
        if psc_image.dtype != np.dtype('uint8'):
            print("Internal ERROR in dtype")
            sys.exit(1)
        range_k = self.psc_range_k
        if options.psc_multi:
            room_no = self.room_no
            n = self.psc_n[room_no]
        else:
            n = self.psc_n
        if n > 0:
            nr = (n + 1.0) / n
            if options.psc_multi:
                vcount = self.psc_vcount[room_no, psc_image, range_k]
                self.psc_vcount[room_no, psc_image, range_k] += 1.0
            else:
                vcount = self.psc_vcount[psc_image, range_k]
                self.psc_vcount[psc_image, range_k] += 1.0
            r_over_rp = np.prod(nr * vcount / (1.0 + vcount))
            dominator = 1.0 - r_over_rp
            if dominator <= 0.0:
                print("psc_add_image: dominator <= 0.0 : dominator=", dominator)
                dominator = 1.0e-20
            psc_count = r_over_rp / dominator
            psc_reward = self.psc_beta / math.pow(psc_count + self.psc_alpha,
                                                  self.psc_rev_pow)
        else:
            if options.psc_multi:
                self.psc_vcount[room_no, psc_image, range_k] += 1.0
            else:
                self.psc_vcount[psc_image, range_k] += 1.0
            psc_count = 0.0
            psc_reward = self.psc_beta / math.pow(psc_count + self.psc_alpha,
                                                  self.psc_rev_pow)

        if options.psc_multi:
            self.psc_n[room_no] += 1.0
        else:
            self.psc_n += 1

        if n % (self.options.score_log_interval * 10) == 0:
            print("[PSC]th={},psc_n={}:room={},psc_reward={:.8f},RM{:02d}".format(
                self.thread_index, n, self.room_no, psc_reward, self.room_no))

        return psc_reward

    # for montezuma's revenge
    #@profile
    def update_montezuma_rooms(self):
        ram = self.ale.getRAM()
        # room_no = ram[0x83]
        room_no = ram[3]
        self.rooms[room_no] += 1
        if self.rooms[room_no] == 1:
            print("[PSC]th={} @@@ NEW ROOM({}) VISITED: visit counts={}".format(
                self.thread_index, room_no, self.rooms))
            self.new_room = room_no
        self.prev_room_no = self.room_no
        self.room_no = room_no

    def set_record_screen_dir(self, record_screen_dir):
        if options.use_gym:
            print("record_screen_dir", record_screen_dir)
            self.gym.monitor.start(record_screen_dir)
            self.reset()
        else:
            print("record_screen_dir", record_screen_dir)
            self.ale.setString(b'record_screen_dir',
                               str.encode(record_screen_dir))
            self.ale.loadROM(self.options.rom.encode('ascii'))
            self.reset()

    def close_record_screen_dir(self):
        if options.use_gym:
            self.gym.monitor.close()
        else:
            pass

    #@profile
    def _process_action(self, action):
        if options.use_gym:
            observation, reward, terminal, _ = self.gym.step(action)
            return reward, terminal
        else:
            reward = self.ale.act(action)
            terminal = self.ale.game_over()
            self.terminal = terminal
            self._have_prev_screen_RGB = False
            return reward, terminal

    #@profile
    def _process_frame(self, action, reshape):
        if self.terminal:
            reward = 0
            terminal = True
        elif options.use_gym:
            observation, reward, terminal, _ = self.gym.step(action)
            self._screen_RGB = observation
            self.terminal = terminal
        else:
            # get previous screen
            if (self.color_maximizing or self.color_averaging) \
                    and not self._have_prev_screen_RGB:
                self.ale.getScreenRGB(self._prev_screen_RGB)
                self._have_prev_screen_RGB = True
            # make action
            reward = self.ale.act(action)
            terminal = self.ale.game_over()
            self.terminal = terminal

        # screen shape is (210, 160, 1)
        if self.color_maximizing or self.color_averaging:  # impossible in gym
            self.ale.getScreenRGB(self._screen_RGB)
            if self._have_prev_screen_RGB:
                if self.color_maximizing:
                    screen = np.maximum(self._prev_screen_RGB, self._screen_RGB)
                else:  # self.color_averaging:
                    screen = np.mean((self._prev_screen_RGB, self._screen_RGB),
                                     axis=0).astype(np.uint8)
            else:
                screen = self._screen_RGB
            screen = screen.reshape((210, 160, 3))
            self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
            # swap screen_RGB
            swap_screen_RGB = self._prev_screen_RGB
            self._prev_screen_RGB = self._screen_RGB
            self._screen_RGB = swap_screen_RGB
            self._have_prev_screen_RGB = True
        elif self.color_no_change:
            if not options.use_gym:
                self.ale.getScreenRGB(self._screen_RGB)
            screen = self._screen_RGB
            screen = screen.reshape((210, 160, 3))
            self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
        else:
            self.ale.getScreenGrayscale(self._screen)

        # reshape it into (210, 160)
        reshaped_screen = np.reshape(self._screen, (210, 160))

        # set uncropped frame for screen output
        self.uncropped_screen = reshaped_screen

        # resize to height=110, width=84
        if self.options.crop_frame:
            resized_screen = cv2.resize(reshaped_screen, (84, 110))
            x_t = resized_screen[18:102, :]
        else:
            x_t = cv2.resize(reshaped_screen, (84, 84))
        x_t_uint8 = x_t
        if reshape:
            x_t = np.reshape(x_t, (84, 84, 1))
        x_t = x_t.astype(np.float32)
        x_t *= (1.0 / 255.0)
        return reward, terminal, x_t, x_t_uint8

    #@profile
    def pseudo_count(self, x_t):
        # update covered rooms
        if self.options.rom == "montezuma_revenge.bin" \
                or self.options.gym_env == "MontezumaRevenge-v0":
            self.update_montezuma_rooms()

        psc_reward = 0.0
        if self.psc_use:
            psc_image = cv2.resize(x_t, (self.psc_frsize, self.psc_frsize))
            psc_image = np.reshape(psc_image, (self.psc_k))
            psc_image = np.uint8(psc_image * (self.psc_maxval / 255.0))
            psc_reward = self.psc_add_image(psc_image)

        # update covered rooms
        if self.options.rom == "montezuma_revenge.bin" \
                or self.options.gym_env == "MontezumaRevenge-v0":
            self.update_montezuma_rooms()

        return psc_reward

    def _setup_display(self):
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            self.ale.setBool(b'sound', False)
        elif sys.platform.startswith('linux'):
            self.ale.setBool(b'sound', True)
        self.ale.setBool(b'display_screen', True)

    def reset(self):
        if options.use_gym:
            self.gym.reset()
        else:
            self.ale.reset_game()

        # randomize initial state
        if self._no_op_max > 0:
            no_op = np.random.randint(
                0, self._no_op_max // self.options.frames_skip_in_ale + 1)
            if options.use_gym:
                no_op = no_op // 3  # gym skips 2 - 4 frames randomly
            for _ in range(no_op):
                if options.use_gym:
                    self.gym.step(0)
                else:
                    self.ale.act(0)

        self._have_prev_screen_RGB = False
        self.terminal = False
        _, _, x_t, x_t_uint8 = self._process_frame(0, False)
        _ = self.pseudo_count(x_t_uint8)

        self.reward = 0
        self.s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        self.lives = float(self.ale.lives())
        self.initial_lives = self.lives

        if (self.thread_index == 0) and (self.record_gs_screen_dir is not None):
            episode_dir = "episode{:03d}".format(self.episode)
            self.episode_record_dir = os.path.join(self.record_gs_screen_dir,
                                                   episode_dir)
            os.makedirs(self.episode_record_dir)
            self.episode += 1
            self.stepNo = 1
            print("game_state: writing screen images to ",
                  self.episode_record_dir)

        self.new_room = -1

    #@profile
    def process(self, action):
        if options.use_gym:
            real_action = action
            if self._display:
                self.gym.render()
        else:
            # convert original 18 action index to minimal action set index
            real_action = self.real_actions[action]

        reward = 0
        if self.options.stack_frames_in_gs:
            s_t1 = []
            terminal = False
            for _ in range(self.options.frames_skip_in_gs):
                if not terminal:
                    r, t, x_t1, x_t_uint8 = self._process_frame(real_action, False)
                    reward = reward + r
                    terminal = terminal or t
                s_t1.append(x_t1)
            self.s_t1 = np.stack(s_t1, axis=2)
            # for _ in range(self.options.frames_skip_in_gs):
            #   r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
            #   reward = reward + r
            #   self.s_t1 = np.append(self.s_t[:,:,1:], x_t1, axis = 2)
            #   if t:
            #     break
        else:
            # altered for speed up (reduce getScreen and color_maximizing)
            for _ in range(self.options.frames_skip_in_gs - 1):
                r, t = self._process_action(real_action)
                reward = reward + r
                if t:
                    self.terminal = True
                    break
            r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
            reward = reward + r
            self.s_t1 = np.append(self.s_t[:, :, 1:], x_t1, axis=2)

        self.reward = reward
        self.terminal = t
        self.psc_reward = self.pseudo_count(x_t_uint8)
        self.lives = float(self.ale.lives())

        if self.episode_record_dir is not None:
            filename = "{:06d}.png".format(self.stepNo)
            filename = os.path.join(self.episode_record_dir, filename)
            self.stepNo += 1
            screen_image = x_t1.reshape((84, 84)) * 255.
            cv2.imwrite(filename, screen_image)

    def update(self):
        self.s_t = self.s_t1
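# A hedged usage sketch, assuming `options` is the parsed option namespace used
# throughout the original project (action_size, rom/gym settings, psc_* fields
# all come from it); the values and the random policy below are illustrative only.
game_state = GameState(rand_seed=113, options=options, display=False,
                       thread_index=0)
for _ in range(100):
    action = np.random.randint(options.action_size)  # stand-in for the learned policy
    game_state.process(action)                       # repeat action with frame skip
    r = game_state.reward + game_state.psc_reward    # extrinsic + pseudo-count bonus
    s_next = game_state.s_t1                         # (84, 84, 4) stacked frames
    if game_state.terminal:
        game_state.reset()
    else:
        game_state.update()                          # roll s_t1 into s_t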
# python_example.py
# Author: Ben Goodrich
#
# This is a direct port to python of the shared library example from
# ALE provided in doc/examples/sharedLibraryInterfaceExample.cpp
from __future__ import print_function

import sys
from random import randrange

from atari_py import ALEInterface

if len(sys.argv) < 2:
    print('Usage:', sys.argv[0], 'rom_file')
    sys.exit()

ale = ALEInterface()

# Get & Set the desired settings
ale.setInt('random_seed', 123)

# Set USE_SDL to true to display the screen. ALE must be compiled
# with SDL enabled for this to work. On OSX, pygame init is used to
# proxy-call SDL_main.
USE_SDL = False
if USE_SDL:
    if sys.platform == 'darwin':
        import pygame
        pygame.init()
        ale.setBool('sound', False)  # Sound doesn't work on OSX
    elif sys.platform.startswith('linux'):
        ale.setBool('sound', True)
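# The snippet above stops before a ROM is loaded. A minimal continuation sketch
# (assumptions: the ROM path comes from sys.argv[1] and a random policy is
# played for a few episodes, as in the full upstream example):
if USE_SDL:
    ale.setBool('display_screen', True)

ale.loadROM(str.encode(sys.argv[1]))
legal_actions = ale.getLegalActionSet()

for episode in range(10):
    total_reward = 0
    while not ale.game_over():
        a = legal_actions[randrange(len(legal_actions))]
        total_reward += ale.act(a)   # apply the action, accumulate reward
    print('Episode %d ended with score: %d' % (episode, total_reward))
    ale.reset_game()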
# Assumed imports for this snippet; PolicyModel is defined elsewhere in the
# original project.
import os

import cv2
import tensorflow as tf
from atari_py import ALEInterface, get_game_path


class VPG(object):

    def __init__(self):
        self.ale = ALEInterface()
        self.ale.loadROM(get_game_path('boxing'))
        self.legal_actions = self.ale.getMinimalActionSet()
        self.policyModel = PolicyModel(self.legal_actions)
        # load model
        if os.path.exists('model'):
            self.policyModel.load_weights('./model/vpg_model')
        self.status_size_ = 4
        self.gamma_ = 1  # the reward is too small

    def status2tensor(self, status):
        status = tf.convert_to_tensor(status, dtype=tf.float32)
        status = tf.transpose(status, [1, 2, 0])
        status = tf.expand_dims(status, 0)
        return status

    def preprocess(self, image):
        frame = image[25:185, :, :]
        frame = cv2.resize(frame, (84, 84)) / 255.0
        return frame

    def PlayOneEpisode(self):
        self.ale.reset_game()
        trajectory = list()
        status = list()
        # initial status
        for i in range(self.status_size_):
            current_frame = self.preprocess(self.ale.getScreenGrayscale())
            status.append(current_frame)
        assert not self.ale.game_over()
        # play until game over
        while not self.ale.game_over():
            # display screen
            cv2.imshow('screen', self.ale.getScreenRGB())
            cv2.waitKey(10)
            # choose action
            inputs = self.status2tensor(status)
            V, P = self.policyModel(inputs)
            action_index = tf.random.categorical(P, 1)
            reward = 0
            for i in range(self.status_size_):
                reward += self.ale.act(self.legal_actions[action_index])
                current_frame = self.preprocess(self.ale.getScreenGrayscale())
                status.append(current_frame)
            game_over = self.ale.game_over()
            trajectory.append((status[0:self.status_size_], action_index,
                               reward, status[1:], game_over))
            status = status[1:]
        total_reward = 0
        for transition in reversed(trajectory):
            total_reward = transition[2] + self.gamma_ * total_reward
        return trajectory, total_reward

    def train(self, loop_time=1000):
        optimizer = tf.keras.optimizers.Adam(1e-3)
        # setup checkpoint and log utils
        checkpoint = tf.train.Checkpoint(model=self.policyModel,
                                         optimizer=optimizer,
                                         optimizer_step=optimizer.iterations)
        checkpoint.restore(tf.train.latest_checkpoint('checkpoints_vpg'))
        log = tf.summary.create_file_writer('checkpoints_vpg')
        for i in range(loop_time):
            trajectory, total_reward = self.PlayOneEpisode()
            avg_policy_loss = tf.keras.metrics.Mean(name='policy loss', dtype=tf.float32)
            avg_value_loss = tf.keras.metrics.Mean(name='value loss', dtype=tf.float32)
            for transition in trajectory:
                # policy loss
                with tf.GradientTape() as tape:
                    Vt, Pt = self.policyModel(self.status2tensor(transition[0]))
                    Vtp1, Ptp1 = self.policyModel(self.status2tensor(transition[3]))
                    action_mask = tf.one_hot(transition[1], len(self.legal_actions))
                    log_probs = tf.math.reduce_sum(action_mask * tf.math.log(Pt), axis=1)
                    advantage = -Vt + transition[2] + self.gamma_ * Vtp1
                    policy_loss = -tf.math.reduce_mean(log_probs * advantage)
                    value_loss = tf.math.squared_difference(Vt, total_reward)
                    loss = policy_loss + value_loss
                    avg_policy_loss.update_state(policy_loss)
                    avg_value_loss.update_state(value_loss)
                # write loss to summary
                if tf.equal(optimizer.iterations % 100, 0):
                    with log.as_default():
                        tf.summary.scalar('policy loss', avg_policy_loss.result(),
                                          step=optimizer.iterations)
                        tf.summary.scalar('value loss', avg_value_loss.result(),
                                          step=optimizer.iterations)
                    avg_policy_loss.reset_states()
                    avg_value_loss.reset_states()
                # train policy and value
                grads = tape.gradient(loss, self.policyModel.variables)
                optimizer.apply_gradients(zip(grads, self.policyModel.variables))
            # save model every episode
            checkpoint.save(os.path.join('checkpoints_vpg', 'ckpt'))
        # save final model
        if not os.path.exists('model'):
            os.mkdir('model')
        # tf.saved_model.save(self.policyModel,'./model/vpg_model')
        self.policyModel.save_weights('./model/vpg_model')
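# A hedged usage sketch: train the policy-gradient agent above for a handful of
# episodes, then play one more episode with the trained weights.
if __name__ == '__main__':
    agent = VPG()
    agent.train(loop_time=10)           # each iteration plays one full episode
    trajectory, total_reward = agent.PlayOneEpisode()
    print('discounted return of final episode:', total_reward)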
# Assumed imports for this snippet; QNet and Loss are defined elsewhere in the
# original project.
import os
import random

import cv2
import numpy as np
import tensorflow as tf
from atari_py import ALEInterface, get_game_path


class DQN(object):

    SHOW = False
    SCALE = 10000
    MEMORY_LIMIT = 4 * SCALE
    BATCH_SIZE = 32
    BURNIN_STEP = 5 * SCALE
    TRAIN_FREQUENCY = 4
    UPDATE_FREQUENCY = SCALE
    STATUS_SIZE = 4
    GAMMA = 0.99
    TEST_INTERVAL = 1000
    ep_end = 0.1
    ep_start = 1.
    ep_end_t = MEMORY_LIMIT
    learn_start = 5 * SCALE

    def __init__(self):
        # ale related members
        self.ale = ALEInterface()
        self.ale.loadROM(get_game_path('boxing'))
        self.legal_actions = self.ale.getMinimalActionSet()
        self.status = list()
        # use qnet_latest to hold the latest updated weights
        self.qnet_latest = QNet(len(self.legal_actions))
        # use qnet_target to hold the target model weights
        self.qnet_target = QNet(len(self.legal_actions))
        if os.path.exists('model'):
            self.qnet_latest.load_weights('./model/dqn_model')
        # use qnet_target as the rollout model
        self.qnet_target.set_weights(self.qnet_latest.get_weights())
        # loss
        self.loss = Loss(len(self.legal_actions), self.GAMMA)
        # status transition memory
        self.memory = list()
        # optimizer
        self.optimizer = tf.keras.optimizers.Adam(
            tf.keras.optimizers.schedules.ExponentialDecay(0.00025, 5 * self.SCALE, 0.96))
        # episode count
        self.ep_count = 0

    def convertImgToTensor(self, status):
        status = tf.constant(status, dtype=tf.float32)  # status.shape = (4, 84, 84)
        status = tf.transpose(status, (1, 2, 0))        # status.shape = (84, 84, 4)
        status = tf.expand_dims(status, axis=0)         # status.shape = (1, 84, 84, 4)
        return status

    def convertBatchToTensor(self, batch):
        st, at, rt, stp1, et = zip(*batch)
        # st.shape = batchsize * [1, 84, 84, 4]
        st = tf.squeeze(tf.concat(st, axis=0))
        at = tf.squeeze(tf.concat(at, axis=0))
        rt = tf.squeeze(tf.concat(rt, axis=0))
        stp1 = tf.squeeze(tf.concat(stp1, axis=0))
        et = tf.squeeze(tf.concat(et, axis=0))
        return (st, at, rt, stp1, et)

    def getObservation(self):
        image = self.ale.getScreenGrayscale()
        frame = image[25:185, :, :]
        frame = cv2.resize(frame, (84, 84)) / 255.0
        return frame

    def remember(self, transition):
        if len(self.memory) > self.MEMORY_LIMIT:
            self.memory.pop(0)
        self.memory.append(transition)

    def reset_game(self):
        self.ale.reset_game()
        self.status = list()
        for i in range(self.STATUS_SIZE):
            current_frame = self.getObservation()
            self.status.append(current_frame)
        assert not self.ale.game_over()

    def rollout(self):
        if self.ale.game_over() or len(self.status) != self.STATUS_SIZE:
            self.reset_game()
        if self.SHOW:
            # display screen
            cv2.imshow('screen', self.ale.getScreenRGB())
            cv2.waitKey(1)
        # choose action
        st = self.convertImgToTensor(self.status)
        Qt = self.qnet_target(st)  # Qt.shape = (1, action_num)
        ep = self.ep_end + max(
            0., (self.ep_start - self.ep_end) *
            (self.ep_end_t - max(0., int(self.optimizer.iterations) - self.learn_start)) / self.ep_end_t)
        if np.random.uniform(low=0., high=1., size=()) < ep:
            # explore at first
            action_index = tf.constant(
                np.random.randint(low=0, high=len(self.legal_actions), size=(1, 1)),
                dtype=tf.int64)  # action_index.shape = (1, 1)
        else:
            # exploit at last
            action_index = tf.random.categorical(
                tf.keras.layers.Softmax(axis=-1)(Qt), 1)  # action_index.shape = (1, 1)
        reward = 0
        for i in range(self.STATUS_SIZE):
            reward += self.ale.act(self.legal_actions[action_index])
            self.status.append(self.getObservation())
            self.status.pop(0)
        stp1 = self.convertImgToTensor(self.status)
        game_over = 1. if self.ale.game_over() else 0.
        self.remember((st, action_index, float(reward), stp1, game_over))
        return game_over

    def train(self, loop_time=10000000):
        # setup checkpoint and log utils
        checkpoint = tf.train.Checkpoint(model=self.qnet_target, optimizer=self.optimizer)
        checkpoint.restore(tf.train.latest_checkpoint('checkpoints_dqn'))
        log = tf.summary.create_file_writer('checkpoints_dqn')
        avg_reward = tf.keras.metrics.Mean(name='reward', dtype=tf.float32)
        self.reset_game()
        for i in range(loop_time):
            game_over = self.rollout()
            if game_over:
                self.ep_count += 1
            if game_over and self.ep_count % self.TEST_INTERVAL == 0:
                # evaluate the updated model
                for _ in range(10):
                    avg_reward.update_state(self.eval(steps=1000))
                with log.as_default():
                    tf.summary.scalar('reward', avg_reward.result(),
                                      step=self.optimizer.iterations)
                print('Step #%d Reward: %.6f lr: %.6f' %
                      (self.optimizer.iterations, avg_reward.result(),
                       self.optimizer._hyper['learning_rate'](self.optimizer.iterations)))
                avg_reward.reset_states()
            # do nothing if collected samples are not enough
            if i < self.BURNIN_STEP or len(self.memory) < self.BATCH_SIZE:
                continue
            # update qnet_latest at certain frequency
            if i % self.TRAIN_FREQUENCY == 0:
                avg_loss = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
                # random sample from memory
                batch = random.sample(self.memory, self.BATCH_SIZE)
                st, at, rt, stp1, et = self.convertBatchToTensor(batch)
                # TD loss
                with tf.GradientTape() as tape:
                    Qt = self.qnet_latest(st)
                    Qtp1 = self.qnet_latest(stp1)
                    loss = self.loss([Qt, Qtp1, rt, at, et])
                    avg_loss.update_state(loss)
                # write loss to summary
                if tf.equal(self.optimizer.iterations % 100, 0):
                    with log.as_default():
                        tf.summary.scalar('loss', avg_loss.result(),
                                          step=self.optimizer.iterations)
                    avg_loss.reset_states()
                # train qnet_latest
                grads = tape.gradient(loss, self.qnet_latest.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.qnet_latest.trainable_variables))
            # sync the target network and save a checkpoint at a fixed frequency
            if i % self.UPDATE_FREQUENCY == 0:
                self.qnet_target.set_weights(self.qnet_latest.get_weights())
                checkpoint.save(os.path.join('checkpoints_dqn', 'ckpt'))
        # save final model
        if not os.path.exists('model'):
            os.mkdir('model')
        # tf.saved_model.save(self.qnet,'./model/vpg_model')
        self.qnet_target.save_weights('./model/dqn_model')

    def eval(self, steps=None):
        # play one (possibly truncated) episode with the latest weights;
        # reset_game() also rebuilds the frame stack held in self.status
        self.reset_game()
        total_reward = 0
        step = 0
        while not self.ale.game_over() and (steps is None or step < steps):
            if self.SHOW:
                # display screen
                cv2.imshow('screen', self.ale.getScreenRGB())
                cv2.waitKey(1)
            st = self.convertImgToTensor(self.status)
            Qt = self.qnet_latest(st)
            action_index = tf.random.categorical(tf.keras.layers.Softmax(axis=-1)(Qt), 1)
            for i in range(self.STATUS_SIZE):
                total_reward += self.ale.act(self.legal_actions[action_index])
                self.status.append(self.getObservation())
                self.status.pop(0)
            step += 1
        return total_reward
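# A hedged usage sketch: build the DQN agent above, run a short (purely
# illustrative) training loop, then report an evaluation score.
if __name__ == '__main__':
    agent = DQN()
    agent.train(loop_time=100000)       # rollouts plus periodic qnet_latest updates
    print('evaluation reward:', agent.eval(steps=1000))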
class GameState(object):
    def __init__(self, rand_seed, options, display=False, no_op_max=30,
                 thread_index=-1):
        if options.use_gym:
            self._display = options.display
        else:
            self.ale = ALEInterface()
            self.ale.setInt(b'random_seed', rand_seed)
            self.ale.setFloat(b'repeat_action_probability',
                              options.repeat_action_probability)
            self.ale.setInt(b'frame_skip', options.frames_skip_in_ale)
            self.ale.setBool(b'color_averaging', options.color_averaging_in_ale)
        self._no_op_max = no_op_max

        self.options = options
        self.color_maximizing = options.color_maximizing_in_gs
        self.color_averaging = options.color_averaging_in_gs
        self.color_no_change = options.color_no_change_in_gs

        # for screen output in _process_frame()
        self.thread_index = thread_index
        self.record_gs_screen_dir = self.options.record_gs_screen_dir
        self.episode_record_dir = None
        self.episode = 1

        self.rooms = np.zeros((24), dtype=np.int)
        self.prev_room_no = 1
        self.room_no = 1
        self.new_room = -1

        if options.use_gym:
            # see https://github.com/openai/gym/issues/349
            def _seed(self, seed=None):
                self.ale.setFloat(b'repeat_action_probability',
                                  options.repeat_action_probability)
                from gym.utils import seeding
                self.np_random, seed1 = seeding.np_random(seed)
                # Derive a random seed. This gets passed as a uint, but gets
                # checked as an int elsewhere, so we need to keep it below
                # 2**31.
                seed2 = seeding.hash_seed(seed1 + 1) % 2**31
                # Empirically, we need to seed before loading the ROM.
                self.ale.setInt(b'random_seed', seed2)
                self.ale.loadROM(self.game_path)
                return [seed1, seed2]

            AtariEnv._seed = _seed
            self.gym = gym.make(options.gym_env)
            self.ale = self.gym.ale
            print(self.gym.action_space)
        else:
            if display:
                self._setup_display()

            self.ale.loadROM(options.rom.encode('ascii'))

            # collect minimal action set
            self.real_actions = self.ale.getMinimalActionSet()
            print("real_actions=", self.real_actions)
            if (len(self.real_actions) != self.options.action_size):
                print("***********************************************************")
                print("* action_size != len(real_actions)")
                print("***********************************************************")
                sys.exit(1)

        # height=210, width=160
        self._screen = np.empty((210 * 160 * 1), dtype=np.uint8)
        if (not options.use_gym) and (self.color_maximizing
                                      or self.color_averaging
                                      or self.color_no_change):
            self._screen_RGB = np.empty((210 * 160 * 3), dtype=np.uint8)
            self._prev_screen_RGB = np.empty((210 * 160 * 3), dtype=np.uint8)
        self._have_prev_screen_RGB = False

        # for pseudo-count
        self.psc_use = options.psc_use
        if options.psc_use:
            self.psc_frsize = options.psc_frsize
            self.psc_k = options.psc_frsize**2
            self.psc_rev_pow = 1.0 / options.psc_pow
            self.psc_alpha = math.pow(0.1, options.psc_pow)
            self.psc_beta = options.psc_beta
            self.psc_maxval = options.psc_maxval
            self.psc_vcount = np.zeros((self.psc_k, self.psc_maxval + 1),
                                       dtype=np.float64)
            self.psc_n = 0

        self.reset()

    # for pseudo-count
    def psc_set_psc_info(self, psc_info):
        if psc_info["psc_n"] != 0:
            self.psc_vcount = np.array(psc_info["psc_vcount"], dtype=np.float64)
            self.psc_n = psc_info["psc_n"]

    def psc_set_gs_info(self, gs_info):
        self.psc_vcount = np.array(gs_info["psc_vcount"], dtype=np.float64)
        self.psc_n = gs_info["psc_n"]
        self.rooms = gs_info["rooms"]
        self.episode = gs_info["episode"]

    # for pseudo-count
    def psc_add_image(self, psc_image):
        if psc_image.dtype != np.dtype('uint8'):
            print("Internal ERROR in dtype")
            sys.exit(1)
        k = self.psc_k
        n = self.psc_n
        if n > 0:
            nr = (n + 1.0) / n
            vcount = self.psc_vcount[range(k), psc_image]
            self.psc_vcount[range(k), psc_image] += 1.0
            r_over_rp = np.prod(nr * vcount / (1.0 + vcount))
            dominator = 1.0 - r_over_rp
            if dominator <= 0.0:
                print("psc_add_image: dominator <= 0.0 : dominator=", dominator)
                dominator = 1.0e-20
            psc_count = r_over_rp / dominator
            psc_reward = self.psc_beta / math.pow(psc_count + self.psc_alpha,
                                                  self.psc_rev_pow)
        else:
            self.psc_vcount[range(k), psc_image] += 1.0
            psc_reward = 0.0

        self.psc_n += 1

        if self.psc_n % (self.options.score_log_interval * 10) == 0:
            room = -1
            if self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0":
                ram = self.ale.getRAM()
                room = ram[3]
            print("[PSC]th={},psc_n={}:room={},psc_reward={:.8f},RM{:02d}".format(
                self.thread_index, self.psc_n, room, psc_reward, self.room_no))

        return psc_reward

    # for montezuma's revenge
    def update_montezuma_rooms(self):
        ram = self.ale.getRAM()
        # room_no = ram[0x83]
        room_no = ram[3]
        self.rooms[room_no] += 1
        if self.rooms[room_no] == 1:
            print("[PSC]th={} @@@ NEW ROOM({}) VISITED: visit counts={}".format(
                self.thread_index, room_no, self.rooms))
            self.new_room = room_no
        self.prev_room_no = self.room_no
        self.room_no = room_no

    def set_record_screen_dir(self, record_screen_dir):
        if options.use_gym:
            print("record_screen_dir", record_screen_dir)
            self.gym.monitor.start(record_screen_dir)
            self.reset()
        else:
            print("record_screen_dir", record_screen_dir)
            self.ale.setString(b'record_screen_dir',
                               str.encode(record_screen_dir))
            self.ale.loadROM(self.options.rom.encode('ascii'))
            self.reset()

    def close_record_screen_dir(self):
        if options.use_gym:
            self.gym.monitor.close()
        else:
            pass

    def _process_action(self, action):
        if options.use_gym:
            observation, reward, terminal, _ = self.gym.step(action)
            return reward, terminal
        else:
            reward = self.ale.act(action)
            terminal = self.ale.game_over()
            self.terminal = terminal
            self._have_prev_screen_RGB = False
            return reward, terminal

    def _process_frame(self, action, reshape):
        if self.terminal:
            reward = 0
            terminal = True
        elif options.use_gym:
            observation, reward, terminal, _ = self.gym.step(action)
            self._screen_RGB = observation
            self.terminal = terminal
        else:
            # get previous screen
            if (self.color_maximizing or self.color_averaging) \
                    and not self._have_prev_screen_RGB:
                self.ale.getScreenRGB(self._prev_screen_RGB)
                self._have_prev_screen_RGB = True
            # make action
            reward = self.ale.act(action)
            terminal = self.ale.game_over()
            self.terminal = terminal

        # screen shape is (210, 160, 1)
        if self.color_maximizing or self.color_averaging:  # impossible in gym
            self.ale.getScreenRGB(self._screen_RGB)
            if self._have_prev_screen_RGB:
                if self.color_maximizing:
                    screen = np.maximum(self._prev_screen_RGB, self._screen_RGB)
                else:  # self.color_averaging:
                    screen = np.mean((self._prev_screen_RGB, self._screen_RGB),
                                     axis=0).astype(np.uint8)
            else:
                screen = self._screen_RGB
            screen = screen.reshape((210, 160, 3))
            self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
            # swap screen_RGB
            swap_screen_RGB = self._prev_screen_RGB
            self._prev_screen_RGB = self._screen_RGB
            self._screen_RGB = swap_screen_RGB
            self._have_prev_screen_RGB = True
        elif self.color_no_change:
            if not options.use_gym:
                self.ale.getScreenRGB(self._screen_RGB)
            screen = self._screen_RGB
            screen = screen.reshape((210, 160, 3))
            self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
        else:
            self.ale.getScreenGrayscale(self._screen)

        # reshape it into (210, 160)
        reshaped_screen = np.reshape(self._screen, (210, 160))

        # set uncropped frame for screen output
        self.uncropped_screen = reshaped_screen

        # resize to height=110, width=84
        if self.options.crop_frame:
            resized_screen = cv2.resize(reshaped_screen, (84, 110))
            x_t = resized_screen[18:102, :]
        else:
            x_t = cv2.resize(reshaped_screen, (84, 84))
        x_t_uint8 = x_t
        if reshape:
            x_t = np.reshape(x_t, (84, 84, 1))
        x_t = x_t.astype(np.float32)
        x_t *= (1.0 / 255.0)
        return reward, terminal, x_t, x_t_uint8

    def pseudo_count(self, x_t):
        psc_reward = 0.0
        if self.psc_use:
            psc_image = cv2.resize(x_t, (self.psc_frsize, self.psc_frsize))
            psc_image = np.reshape(psc_image, (self.psc_k))
            psc_image = np.uint8(psc_image * (self.psc_maxval / 255.0))
            psc_reward = self.psc_add_image(psc_image)

        # update covered rooms
        if self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0":
            self.update_montezuma_rooms()

        return psc_reward

    def _setup_display(self):
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            self.ale.setBool(b'sound', False)
        elif sys.platform.startswith('linux'):
            self.ale.setBool(b'sound', True)
        self.ale.setBool(b'display_screen', True)

    def reset(self):
        if options.use_gym:
            self.gym.reset()
        else:
            self.ale.reset_game()

        # randomize initial state
        if self._no_op_max > 0:
            no_op = np.random.randint(
                0, self._no_op_max // self.options.frames_skip_in_ale + 1)
            if options.use_gym:
                no_op = no_op // 3  # gym skip 2 - 4 frame randomly
            for _ in range(no_op):
                if options.use_gym:
                    self.gym.step(0)
                else:
                    self.ale.act(0)

        self._have_prev_screen_RGB = False
        self.terminal = False
        _, _, x_t, x_t_uint8 = self._process_frame(0, False)
        _ = self.pseudo_count(x_t_uint8)

        self.reward = 0
        self.s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        self.lives = float(self.ale.lives())
        self.initial_lives = self.lives

        if (self.thread_index == 0) and (self.record_gs_screen_dir is not None):
            episode_dir = "episode{:03d}".format(self.episode)
            self.episode_record_dir = os.path.join(self.record_gs_screen_dir,
                                                   episode_dir)
            os.makedirs(self.episode_record_dir)
            self.episode += 1
            self.stepNo = 1
            print("game_state: writing screen images to ",
                  self.episode_record_dir)

        self.new_room = -1

    def process(self, action):
        if options.use_gym:
            real_action = action
            if self._display:
                self.gym.render()
        else:
            # convert original 18 action index to minimal action set index
            real_action = self.real_actions[action]

        reward = 0
        if self.options.stack_frames_in_gs:
            s_t1 = []
            terminal = False
            for _ in range(self.options.frames_skip_in_gs):
                if not terminal:
                    r, t, x_t1, x_t_uint8 = self._process_frame(
                        real_action, False)
                    reward = reward + r
                    terminal = terminal or t
                s_t1.append(x_t1)
            self.s_t1 = np.stack(s_t1, axis=2)
            # for _ in range(self.options.frames_skip_in_gs):
            #   r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
            #   reward = reward + r
            #   self.s_t1 = np.append(self.s_t[:,:,1:], x_t1, axis = 2)
            #   if t:
            #     break
        else:
            # altered for speed up (reduce getScreen and color_maximizing)
            for _ in range(self.options.frames_skip_in_gs - 1):
                r, t = self._process_action(real_action)
                reward = reward + r
                if t:
                    self.terminal = True
                    break
            r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
            reward = reward + r
            self.s_t1 = np.append(self.s_t[:, :, 1:], x_t1, axis=2)

        self.reward = reward
        self.terminal = t
        self.psc_reward = self.pseudo_count(x_t_uint8)
        self.lives = float(self.ale.lives())

        if self.episode_record_dir is not None:
            filename = "{:06d}.png".format(self.stepNo)
            filename = os.path.join(self.episode_record_dir, filename)
            self.stepNo += 1
            screen_image = x_t1.reshape((84, 84)) * 255.
            cv2.imwrite(filename, screen_image)

    def update(self):
        self.s_t = self.s_t1
# Assumed imports for this snippet; ALEInterface, ACTION_MEANING, ROM_URL, and
# the _ALE_LOCK threading lock are module-level names in the original project.
import os

import cv2
import gym
import numpy as np
from gym import spaces


class AtariPlayer(gym.Env):
    """
    A wrapper for the ALE emulator, with configurations to mimic DeepMind DQN settings.

    Info:
        score: the accumulated reward in the current game
        gameOver: True when the current game is over
    """

    def __init__(self, rom_file, viz=0,
                 frame_skip=4, nullop_start=30,
                 live_lost_as_eoe=True, max_num_frames=0):
        """
        Args:
            rom_file: path to the rom
            frame_skip: skip every k frames and repeat the action
            viz: visualization to be done.
                Set to 0 to disable.
                Set to a positive number to be the delay between frames to show.
                Set to a string to be a directory to store frames.
            nullop_start: start with a random number of null ops.
            live_lost_as_eoe: consider loss of a life as end of episode. Useful for training.
            max_num_frames: maximum number of frames per episode.
        """
        super(AtariPlayer, self).__init__()
        assert os.path.isfile(rom_file), \
            "rom {} not found. Please download at {}".format(rom_file, ROM_URL)

        try:
            ALEInterface.setLoggerMode(ALEInterface.Logger.Error)
        except AttributeError:
            print("You're not using the latest ALE")

        # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86
        with _ALE_LOCK:
            self.ale = ALEInterface()
            self.ale.setInt(b"random_seed", np.random.randint(0, 30000))
            self.ale.setInt(b"max_num_frames_per_episode", max_num_frames)
            self.ale.setBool(b"showinfo", False)

            self.ale.setInt(b"frame_skip", 1)
            self.ale.setBool(b'color_averaging', False)
            # manual.pdf suggests otherwise.
            self.ale.setFloat(b'repeat_action_probability', 0.0)

            # viz setup
            if isinstance(viz, str):
                assert os.path.isdir(viz), viz
                self.ale.setString(b'record_screen_dir', viz)
                viz = 0
            if isinstance(viz, int):
                viz = float(viz)
            self.viz = viz
            if self.viz and isinstance(self.viz, float):
                self.windowname = os.path.basename(rom_file)
                cv2.startWindowThread()
                cv2.namedWindow(self.windowname)

            self.ale.loadROM(rom_file.encode('utf-8'))
        self.width, self.height = self.ale.getScreenDims()
        self.actions = self.ale.getMinimalActionSet()

        self.live_lost_as_eoe = live_lost_as_eoe
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start

        self.action_space = spaces.Discrete(len(self.actions))
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.height, self.width), dtype=np.uint8)
        self._restart_episode()

    def get_action_meanings(self):
        return [ACTION_MEANING[i] for i in self.actions]

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        m = self.ale.getScreenRGB()
        return m.reshape((self.height, self.width, 3))

    def _current_state(self):
        """
        :returns: a gray-scale (h, w) uint8 image
        """
        ret = self._grab_raw_image()
        # avoid the missing-frame issue: max-pool over the last screen
        ret = np.maximum(ret, self.last_raw_screen)
        if self.viz:
            if isinstance(self.viz, float):
                cv2.imshow(self.windowname, ret)
                cv2.waitKey(int(self.viz * 1000))
        ret = ret.astype('float32')
        # weights 0.299, 0.587, 0.114; same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)
        return ret.astype('uint8')  # to save some memory

    def _restart_episode(self):
        with _ALE_LOCK:
            self.ale.reset_game()

        # random null-ops start
        n = np.random.randint(self.nullop_start)
        self.last_raw_screen = self._grab_raw_image()
        for k in range(n):
            if k == n - 1:
                self.last_raw_screen = self._grab_raw_image()
            self.ale.act(0)

    def reset(self):
        if self.ale.game_over():
            self._restart_episode()
        return self._current_state()

    def step(self, act):
        oldlives = self.ale.lives()
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            r += self.ale.act(self.actions[act])
            newlives = self.ale.lives()
            if self.ale.game_over() or \
                    (self.live_lost_as_eoe and newlives < oldlives):
                break

        isOver = self.ale.game_over()
        if self.live_lost_as_eoe:
            isOver = isOver or newlives < oldlives

        info = {'ale.lives': newlives}
        return self._current_state(), r, isOver, info
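# A hedged usage sketch for AtariPlayer (the ROM path is a placeholder, not part
# of the original code): the class follows the gym.Env API, so a random rollout
# looks like any other gym loop.
player = AtariPlayer('/path/to/breakout.bin', viz=0, frame_skip=4)
obs = player.reset()                      # (210, 160) uint8 grayscale frame
done, score = False, 0
while not done:
    act = player.action_space.sample()    # random action index
    obs, reward, done, info = player.step(act)
    score += reward
print('episode score:', score, 'lives left:', info['ale.lives'])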