class AtariWrapper():
    """ALE wrapper that tries to mimic the options in the DQN paper,
    including the preprocessing (except resizing/cropping).
    """
    action_words = [
        'NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN',
        'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT'
    ]
    _action_set = [0, 2, 3, 4, 5, 6, 7, 8, 9]  # Valid actions for ALE.

    # Possible actions are just the indices 0..num_valid_actions-1; we still
    # need to map from the latter to the former when stepping.
    possible_actions = list(range(len(_action_set)))

    def __init__(self, rom_path, seed=123, frameskip=4, show_display=False,
                 stack_num_states=4, concatenate_state_every=4):
        """
        Parameters:
            frameskip: either a tuple (indicating a random range to choose
                from, with the top value excluded) or an int. Also known as
                action repeat.
            stack_num_states: number of channels/frames to stack into a state.
            concatenate_state_every: append one channel to the state every
                this many frames. Counted in absolute frames, independent of
                frameskip.
        """
        self.stack_num_states = stack_num_states
        self.concatenate_state_every = concatenate_state_every

        self.game_path = rom_path
        if not os.path.exists(self.game_path):
            raise IOError('You asked for ROM %s but path %s does not exist' %
                          (rom_path, self.game_path))
        self.frameskip = frameskip

        try:
            self.ale = ALEInterface()
        except Exception as e:
            print(
                "ALEInterface could not be loaded. ale_python_interface import failed"
            )
            raise e

        # Set some default options.
        self.ale.setInt(b'random_seed', seed)
        self.ale.setBool(b'sound', False)
        self.ale.setBool(b'display_screen', show_display)
        self.ale.setFloat(b'repeat_action_probability', 0.)

        # Load the ROM.
        self.ale.loadROM(self.game_path)
        (self.screen_width, self.screen_height) = self.ale.getScreenDims()

        # Holds the two most recent frames, which are max-pooled together.
        self.latest_frame_fifo = deque(maxlen=2)
        self.state_fifo = deque(maxlen=stack_num_states)

    def _step(self, a, force_noop=False):
        """Perform one step of the environment, automatically repeating the
        action self.frameskip times.

        Parameters:
            force_noop: force a no-op, ignoring the action supplied.
        """
        assert a in self.possible_actions + [0]

        if force_noop:
            action, num_steps = 0, 1
        else:
            action = self._action_set[a]
            if isinstance(self.frameskip, int):
                num_steps = self.frameskip
            else:
                num_steps = np.random.randint(self.frameskip[0],
                                              self.frameskip[1])

        reward = 0.0
        for i in range(num_steps):
            reward += self.ale.act(action)
            cur_frame = self.observe_raw(get_rgb=True)
            cur_frame_cropped = self.crop_frame(cur_frame)
            self.latest_frame_fifo.append(cur_frame_cropped)

            if i % self.concatenate_state_every == 0:
                curmax_frame = np.amax(self.latest_frame_fifo, axis=0)
                frame_lumi = self.convert_to_gray(curmax_frame)
                self.state_fifo.append(frame_lumi)

        # Transpose so we get HxWxC instead of CxHxW.
        self.current_frame = np.array(np.transpose(self.state_fifo, (1, 2, 0)))
        return self.current_frame, reward, self.ale.game_over(), {
            "ale.lives": self.ale.lives()
        }

    def step(self, *args, **kwargs):
        """Performs one step of the environment."""
        lives_before = self.ale.lives()
        next_state, reward, done, info = self._step(*args, **kwargs)
        lives_after = self.ale.lives()

        # End the episode when a life is lost.
        if lives_before > lives_after:
            done = True

        return next_state, reward, done, info

    def observe_raw(self, get_rgb=False):
        """Observe either RGB or grayscale frames.

        Initializing the arrays here forces ALE not to write through stale
        pointers.
        """
        if get_rgb:
            cur_frame_rgb = np.zeros(
                (self.screen_height, self.screen_width, 3), dtype=np.uint8)
            self.ale.getScreenRGB(cur_frame_rgb)
            return cur_frame_rgb
        else:
            cur_frame_gray = np.zeros((self.screen_height, self.screen_width),
                                      dtype=np.uint8)
            self.ale.getScreenGrayscale(cur_frame_gray)
            return cur_frame_gray

    def crop_frame(self, frame):
        """Simply crops a frame. Does nothing by default."""
        return frame

    def convert_to_gray(self, img):
        """Get the luminance channel."""
        img_f = np.float32(img)
        img_lumi = 0.299 * img_f[:, :, 0] + \
                   0.587 * img_f[:, :, 1] + \
                   0.114 * img_f[:, :, 2]
        return np.uint8(img_lumi)

    def reset(self):
        """Reset the game."""
        self.ale.reset_game()
        s = self.observe_raw(get_rgb=True)
        s = self.crop_frame(s)

        # Populate missing frames with blank ones.
        for _ in range(self.stack_num_states - 1):
            self.state_fifo.append(np.zeros(shape=(s.shape[0], s.shape[1])))

        self.latest_frame_fifo.append(s)

        # Push the latest frame.
        curmax_frame = s
        frame_lumi = self.convert_to_gray(s)
        self.state_fifo.append(frame_lumi)

        self.state = np.transpose(self.state_fifo, (1, 2, 0))
        return self.state

    def get_action_meanings(self):
        """Return in text what the actions correspond to."""
        return [ACTION_MEANING[i] for i in self._action_set]

    def save_state(self):
        """Saves the current state and returns an identifier for the saved
        state.
        """
        return self.ale.cloneSystemState()

    def restore_state(self, ident):
        """Restore game state.

        Restores the saved state of the system and performs a no-op so a new
        frame can be generated in case a restore is followed by an observe().
        """
        self.ale.restoreSystemState(ident)
        self.step(0, force_noop=True)
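# Minimal usage sketch for AtariWrapper (illustrative only, not part of the
# original module).  The ROM path is a placeholder, and the sketch relies on
# the same module-level imports the wrapper itself uses (numpy as np,
# ALEInterface, deque, ACTION_MEANING).
def random_rollout(rom_path='roms/breakout.bin', max_steps=1000, seed=0):
    env = AtariWrapper(rom_path, seed=seed, frameskip=4)
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        a = np.random.choice(env.possible_actions)  # random policy stand-in
        state, reward, done, info = env.step(a)
        total_reward += reward
        if done:
            state = env.reset()
    return total_reward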
class Environment: """docstring for Environment""" BUFFER_LEN = 2 EPISODE_FRAMES = 18000 EPOCH_COUNT = 200 EPOCH_STEPS = 250000 EVAL_EPS = 0.001 FRAMES_SKIP = 4 FRAME_HEIGHT = 84 FRAME_WIDTH = 84 MAX_NO_OP = 30 MAX_REWARD = 1 def __init__(self, rom_name, rng, display_screen = False): self.api = ALEInterface() self.api.setInt('random_seed', rng.randint(333)) self.api.setBool('display_screen', display_screen) self.api.setFloat('repeat_action_probability', 0.0) self.rom_name = rom_name self.display_screen = display_screen self.rng = rng self.repeat = Environment.FRAMES_SKIP self.buffer_len = Environment.BUFFER_LEN self.height = Environment.FRAME_HEIGHT self.width = Environment.FRAME_WIDTH self.episode_steps = Environment.EPISODE_FRAMES / Environment.FRAMES_SKIP self.merge_id = 0 self.max_reward = Environment.MAX_REWARD self.eval_eps = Environment.EVAL_EPS self.log_dir = '' self.network_dir = '' self.api.loadROM('../rom/' + self.rom_name) self.minimal_actions = self.api.getMinimalActionSet() original_width, original_height = self.api.getScreenDims() self.merge_frame = np.zeros((self.buffer_len , original_height , original_width) , dtype = np.uint8) def get_action_count(self): return len(self.minimal_actions) def train(self, agent, store_freq, folder = None, start_epoch = 0): self._open_log_files(agent, folder) obs = np.zeros((self.height, self.width), dtype = np.uint8) epoch_count = Environment.EPOCH_COUNT for epoch in xrange(start_epoch, epoch_count): self.need_reset = True steps_left = Environment.EPOCH_STEPS print "\n" + "=" * 50 print "Epoch #%d" % (epoch + 1) episode = 0 train_start = time.time() while steps_left > 0: num_step, _ = self._run_episode(agent, steps_left, obs) steps_left -= num_step episode += 1 if steps_left == 0 or episode % 10 == 0: print "Finished episode #%d, steps_left = %d" \ % (episode, steps_left) train_end = time.time() valid_values = agent.get_validate_values() eval_values = self.evaluate(agent) test_end = time.time() train_time = train_end - train_start test_time = test_end - train_end step_per_sec = Environment.EPOCH_STEPS * 1. 
/ max(1, train_time) print "\tFinished epoch #%d, episode trained = %d\n" \ "\tValidate values = %.3f, evaluate reward = %.3f\n"\ "\tTrain time = %.0fs, test time = %.0fs, steps/sec = %.4f" \ % (epoch + 1, episode, valid_values, eval_values\ , train_time, test_time, step_per_sec) self._update_log_files(agent, epoch + 1, episode , valid_values, eval_values , train_time, test_time , step_per_sec, store_freq) gc.collect() def evaluate(self, agent, episodes = 30, obs = None): print "\n***Start evaluating" if obs is None: obs = np.zeros((self.height, self.width), dtype = np.uint8) sum_reward = 0.0 sum_step = 0.0 for episode in xrange(episodes): self.need_reset = True step, reward = self._run_episode(agent, self.episode_steps, obs , self.eval_eps, evaluating = True) sum_reward += reward sum_step += step print "Finished episode %d, reward = %d, step = %d" \ % (episode + 1, reward, step) self.need_reset = True print "Average reward per episode = %.4f" % (sum_reward / episodes) print "Average step per episode = %.4f" % (sum_step / episodes) return sum_reward / episodes def _prepare_game(self): if self.need_reset or self.api.game_over(): self.api.reset_game() self.need_reset = False if Environment.MAX_NO_OP > 0: num_no_op = self.rng.randint(Environment.MAX_NO_OP + 1) \ + self.buffer_len for _ in xrange(num_no_op): self.api.act(0) for _ in xrange(self.buffer_len): self._update_buffer() def _run_episode(self, agent, steps_left, obs , eps = 0.0, evaluating = False): self._prepare_game() start_lives = self.api.lives() step_count = 0 sum_reward = 0 is_terminal = False while step_count < steps_left and not is_terminal: self._get_screen(obs) action_id, _ = agent.get_action(obs, eps, evaluating) reward = self._repeat_action(self.minimal_actions[action_id]) reward_clip = reward if self.max_reward > 0: reward_clip = np.clip(reward, -self.max_reward, self.max_reward) life_lost = not evaluating and self.api.lives() < start_lives is_terminal = self.api.game_over() or life_lost \ or step_count + 1 >= steps_left agent.add_experience(obs, is_terminal, action_id, reward_clip , evaluating) sum_reward += reward step_count += 1 return step_count, sum_reward def _update_buffer(self): self.api.getScreenGrayscale(self.merge_frame[self.merge_id, ...]) self.merge_id = (self.merge_id + 1) % self.buffer_len def _repeat_action(self, action): reward = 0 for i in xrange(self.repeat): reward += self.api.act(action) if i + self.buffer_len >= self.repeat: self._update_buffer() return reward def _get_screen(self, resized_frame): self._resize_frame(self.merge_frame.max(axis = 0), resized_frame) def _resize_frame(self, src_frame, dst_frame): cv2.resize(src = src_frame, dst = dst_frame, dsize = (self.width, self.height), interpolation = cv2.INTER_LINEAR) def _open_log_files(self, agent, folder): time_str = time.strftime("_%m-%d-%H-%M", time.localtime()) base_rom_name = os.path.splitext(os.path.basename(self.rom_name))[0] if folder is not None: self.log_dir = folder self.network_dir = self.log_dir + '/network' else: self.log_dir = '../run_results/' + base_rom_name + time_str self.network_dir = self.log_dir + '/network' info_name = get_next_name(self.log_dir, 'info', 'txt') git_name = get_next_name(self.log_dir, 'git-diff', '') try: os.stat(self.log_dir) except OSError: os.makedirs(self.log_dir) try: os.stat(self.network_dir) except OSError: os.makedirs(self.network_dir) with open(os.path.join(self.log_dir, info_name), 'w') as f: f.write('Commit: ' + subprocess.check_output(['git', 'rev-parse' , 'HEAD'])) f.write('Run command: ') 
f.write(' '.join(pipes.quote(x) for x in sys.argv)) f.write('\n\n') f.write(agent.get_info()) write_info(f, Environment) write_info(f, agent.__class__) write_info(f, agent.network.__class__) # From https://github.com/spragunr/deep_q_rl/pull/49/files with open(os.path.join(self.log_dir, git_name), 'w') as f: f.write(subprocess.check_output(['git', 'diff', 'HEAD'])) if folder is not None: return with open(os.path.join(self.log_dir, 'results.csv'), 'w') as f: f.write("epoch,episode_train,validate_values,evaluate_reward"\ ",train_time,test_time,steps_per_second\n") mem = psutil.virtual_memory() with open(os.path.join(self.log_dir, 'memory.csv'), 'w') as f: f.write("epoch,available,free,buffers,cached"\ ",available_readable,used_percent\n") f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \ (0, mem.available, mem.free, mem.buffers, mem.cached , bytes2human(mem.available), mem.percent)) def _update_log_files(self, agent, epoch, episode, valid_values , eval_values, train_time, test_time, step_per_sec , store_freq): print "Updating log files" with open(self.log_dir + '/results.csv', 'a') as f: f.write("%d,%d,%.4f,%.4f,%d,%d,%.4f\n" % \ (epoch, episode, valid_values, eval_values , train_time, test_time, step_per_sec)) mem = psutil.virtual_memory() with open(self.log_dir + '/memory.csv', 'a') as f: f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \ (epoch, mem.available, mem.free, mem.buffers, mem.cached , bytes2human(mem.available), mem.percent)) agent.dump_network(self.network_dir + ('/%03d' % (epoch)) + '.npz') if (store_freq >= 0 and epoch >= Environment.EPOCH_COUNT) or \ (store_freq > 0 and (epoch % store_freq == 0)): agent.dump_exp(self.network_dir + '/exp.npz') def _setup_record(self, network_file): file_name, _ = os.path.splitext(os.path.basename(network_file)) time_str = time.strftime("_%m-%d-%H-%M", time.localtime()) img_dir = os.path.dirname(network_file) + '/images_' \ + file_name + time_str rom_name, _ = os.path.splitext(self.rom_name) out_name = os.path.dirname(network_file) + '/' + rom_name + '_' \ + file_name + time_str + '.mov' print out_name try: os.stat(img_dir) except OSError: os.makedirs(img_dir) self.api.setString('record_screen_dir', img_dir) self.api.loadROM('../rom/' + self.rom_name) return img_dir, out_name def record_run(self, agent, network_file, episode_id = 1): if episode_id > 1: self.evaluate(agent, episode_id - 1) system_state = self.api.cloneSystemState() img_dir, out_name = self._setup_record(network_file) if episode_id > 1: self.api.restoreSystemState(system_state) self.evaluate(agent, 1) script = \ """ { ffmpeg -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s } || { avconv -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s } """ % (img_dir, out_name, img_dir, out_name) os.system(script)
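# Minimal usage sketch for Environment (illustrative only, not part of the
# original module).  The agent object and its methods (get_action,
# add_experience, get_validate_values, dump_network, dump_exp, get_info,
# network) are assumptions inferred from the calls made in train() and
# _run_episode() above; the ROM name is a placeholder resolved against
# '../rom/' by the constructor.
def launch_training(agent, rom_name='breakout.bin', seed=123):
    rng = np.random.RandomState(seed)
    env = Environment(rom_name, rng, display_screen=False)
    print "Minimal action set size: %d" % env.get_action_count()
    # store_freq > 0 also dumps the experience buffer every store_freq epochs.
    env.train(agent, store_freq=10, folder=None, start_epoch=0)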
def main(): result = { 'name': [], 'grouped_num': [], 'distribution': [], } result_str = '' # all_game_list = ['air_raid-n', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis'] # all_game_list = ['bank_heist', 'battle_zone', 'beam_rider', 'berzerk-n', 'bowling', 'boxing', 'breakout', 'carnival-n'] # all_game_list = ['centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk'] # all_game_list = ['elevator_action-n', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar'] # all_game_list = ['hero', 'ice_hockey', 'jamesbond', 'journey_escape-n', 'kangaroo', 'krull', 'kung_fu_master'] # all_game_list = ['montezuma_revenge-n', 'ms_pacman', 'name_this_game', 'phoenix-n', 'pitfall-n', 'pong', 'pooyan-n'] # all_game_list = ['private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing-n'] # all_game_list = ['solaris-n', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down'] # all_game_list = ['venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge-n', 'zaxxon'] # all_game_list = ['pong', 'assault','ms_pacman'] all_game_list = ['assault'] for game in all_game_list: if '-n' in game: '''games that are not in the nature DQN list''' continue import atari_py game_path = atari_py.get_game_path(game) game_path = str.encode(game_path) env = ALEInterface() env.setFloat('repeat_action_probability'.encode('utf-8'), 0.0) env.setInt(b'random_seed', 3) env.loadROM(game_path) env.reset_game() if test in ['restoreState']: state_after_reset = env.cloneState() if test in ['restoreSystemState']: state_after_reset = env.cloneSystemState() if test in ['setRAM']: ram_after_reset = env.getRAM() state_after_reset = env.cloneSystemState() ram_candidate = np.load( './stochasticity_ram_mask/{}.npy'.format(game), ) print('=====================================================') try: action_sequence = np.load( './action_sequence/action_sequence_{}_{}.npy'.format( sequence, game, )) print('action_sequence loaded') except Exception as e: '''generate a sequence of actions''' action_sequence = np.random.randint( len(env.getMinimalActionSet()), size=sequence, ) np.save( './action_sequence/action_sequence_{}_{}.npy'.format( sequence, game, ), action_sequence, ) print('action_sequence generated') print('=====================================================') bunch_obs = [] distribution = [] episode_length = -1 state_metrix = [] ram_metrix = [] for bunch_i in range(bunch): if test in ['loadROM']: env.setInt(b'random_seed', bunch_i) env.loadROM(game_path) env.reset_game() elif test in ['restoreState']: env.restoreState(state_after_reset) elif test in ['restoreSystemState']: env.restoreSystemState(state_after_reset) elif test in ['setRAM']: env.reset_game() env.restoreSystemState(state_after_reset) env.setRAM(ram_after_reset) env.setRAM(env.getRAM() * (1 - ram_candidate) + ram_candidate * (bunch_i % 255)) state_sequence = [] ram_sequence = [] has_terminated = False for sequence_i in range(sequence): for frame_skip_i in range(frame_skip): if not has_terminated: env.act(env.getMinimalActionSet()[ action_sequence[sequence_i]]) if env.game_over(): episode_length = sequence_i has_terminated = True if has_terminated: break try: clear_print('[{}|{}|{}]'.format(bunch_i, sequence_i, episode_length)) except Exception as e: pass state_sequence += [env.getScreenRGB()] ram_sequence += [process_ram(env.getRAM())] if has_terminated: break if sequence > 0: if episode_length < 0: # raise Exception('Did not terminated') print('# WARNING: Did not 
terminated') obs = env.getScreenRGB() state_metrix += [copy.deepcopy(state_sequence)] ram_metrix += [copy.deepcopy(ram_sequence)] if_has_identical_one = False for bunch_obs_i in range(len(bunch_obs)): max_value = np.max(np.abs(obs - bunch_obs[bunch_obs_i])) if max_value < 1: if_has_identical_one = True distribution[bunch_obs_i] += 1 break if if_has_identical_one is False: bunch_obs += [obs] distribution += [1] grouped_num = len(bunch_obs) result_str = '{}game:{} grouped_num:{} distribution:{} \n'.format( result_str, game, grouped_num, distribution, ) try: game_list += [game] except Exception as e: game_list = [game] try: grouped_num_list += [grouped_num] except Exception as e: grouped_num_list = [grouped_num] max_lenth = 0 for bunch_i in range(len(state_metrix)): if len(state_metrix[bunch_i]) > max_lenth: max_lenth = len(state_metrix[bunch_i]) for bunch_i in range(len(state_metrix)): state_metrix[bunch_i] += ([ np.zeros(shape=state_metrix[0][0].shape, dtype=state_metrix[0][0].dtype) ] * (max_lenth - len(state_metrix[bunch_i]))) ram_metrix[bunch_i] += ([ np.zeros(shape=ram_metrix[0][0].shape, dtype=ram_metrix[0][0].dtype) ] * (max_lenth - len(state_metrix[bunch_i]))) state_list = [] state_metrix_id = np.zeros((len(state_metrix), len(state_metrix[0])), dtype=int) for bunch_i in range(len(state_metrix)): for sequence_i in range(len(state_metrix[0])): found_in_state_list = False for state_list_id in range(len(state_list)): if np.max(state_list[state_list_id] - state_metrix[bunch_i][sequence_i]) < 1: state_metrix_id[bunch_i][sequence_i] = state_list_id found_in_state_list = True break if not found_in_state_list: state_list += [np.copy(state_metrix[bunch_i][sequence_i])] state_metrix_id[bunch_i][sequence_i] = (len(state_list) - 1) state_metrix_id_unsorted = np.copy(state_metrix_id) state_metrix_id = state_metrix_id.tolist() state_metrix_id.sort(key=lambda row: row[:], reverse=True) state_metrix_id = np.array(state_metrix_id) fig, ax = plt.subplots() im = ax.imshow(state_metrix_id) plt.show() plt.savefig( './results/{}_state_metrix_id.jpg'.format(game), dpi=600, ) state_metrix_figure = np.zeros( ((10 + state_metrix[0][0].shape[0]) * len(state_metrix), state_metrix[0][0].shape[1] * len(state_metrix[0]), state_metrix[0][0].shape[2]), dtype=state_metrix[0][0].dtype) ram_metrix_figure = np.zeros( ((5 + ram_metrix[0][0].shape[0]) * len(state_metrix), ram_metrix[0][0].shape[1] * len(state_metrix[0]), ram_metrix[0][0].shape[2]), dtype=ram_metrix[0][0].dtype) ram_candidate = list(range(env.getRAMSize())) for bunch_i in range(len(state_metrix)): ram_metrix_figure[((bunch_i) * (5 + ram_metrix[0][0].shape[0])):( 5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])), :, 2] = 255 for bunch_i in range(len(state_metrix)): for sequence_i in range(len(state_metrix[0])): state_metrix_figure[ (10 + (bunch_i) * (10 + state_metrix[0][0].shape[0])):(bunch_i + 1) * (10 + state_metrix[0][0].shape[0]), (sequence_i) * state_metrix[0][0].shape[1]:(sequence_i + 1) * state_metrix[0][0].shape[1]] = state_list[ state_metrix_id[bunch_i][sequence_i]] for bunch_ii in range(state_metrix_id.shape[0]): if np.max(state_metrix_id_unsorted[bunch_ii] - state_metrix_id[bunch_i]) < 1: at_unsorted_bunch = bunch_ii break ram_metrix_figure[( 5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])):(bunch_i + 1) * (5 + ram_metrix[0][0].shape[0]), (sequence_i) * ram_metrix[0][0].shape[1]:(sequence_i + 1) * ram_metrix[0][0].shape[1]] = ram_metrix[ at_unsorted_bunch][sequence_i] for bunch_i in range(len(state_metrix)): for sequence_i in 
range(len(state_metrix[0])): if bunch_i > 0: if state_metrix_id[bunch_i][sequence_i] != state_metrix_id[ bunch_i - 1][sequence_i]: # draw a line to seperate the bunches previous = ram_metrix_figure[( 5 + (bunch_i - 1) * (5 + ram_metrix[0][0].shape[0])):( (bunch_i) * (5 + ram_metrix[0][0].shape[0])), sequence_i, 0] later = ram_metrix_figure[( 5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])):( (bunch_i + 1) * (5 + ram_metrix[0][0].shape[0])), sequence_i, 0] delta = np.abs(previous - later) state_metrix_figure[( (bunch_i) * (10 + state_metrix[0][0].shape[0])):( 10 + (bunch_i) * (10 + state_metrix[0][0].shape[0])), (sequence_i) * state_metrix[0][0].shape[1]:, 0] = 255 ram_metrix_figure[((bunch_i) * (5 + ram_metrix[0][0].shape[0]) ):(5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])), (sequence_i) * ram_metrix[0][0].shape[1]:, 0] = 255 ram_metrix_figure[((bunch_i) * (5 + ram_metrix[0][0].shape[0]) ):(5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])), (sequence_i) * ram_metrix[0][0].shape[1]:, 1:] = 0 from PIL import Image Image.fromarray(state_metrix_figure).save( "./results/{}_state_metrix_figure.jpeg".format(game)) Image.fromarray(ram_metrix_figure.astype( state_metrix_figure.dtype)).save( "./results/{}_ram_metrix_figure.jpeg".format(game)) print(result_str) print('===============') for game_i in range(len(game_list)): print(game_list[game_i]) for grouped_num_i in range(len(grouped_num_list)): print(grouped_num_list[grouped_num_i])
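# Standalone restatement of the grouping step used in main() above
# (illustrative only).  Two runs of the same action sequence are counted as
# identical when their final frames differ by less than one intensity level
# everywhere, which is how the script measures residual stochasticity.  This
# helper is hypothetical and not part of the original script.
def group_final_frames(final_frames):
    groups = []        # one representative frame per group
    distribution = []  # how many runs fell into each group
    for obs in final_frames:
        for i, rep in enumerate(groups):
            # Cast to a signed type so the absolute difference cannot wrap.
            if np.max(np.abs(obs.astype(np.int16) - rep.astype(np.int16))) < 1:
                distribution[i] += 1
                break
        else:
            groups.append(np.copy(obs))
            distribution.append(1)
    return len(groups), distribution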
class aleForET: def __init__(self, rom_file, screen, rndseed, resume_state_file=None): # When you might pass None to screen: # You are not interested in running any functions that displays graphics # For example, you should only run proceed_one_step__fast__no_scr_support() # Otherwise, those functions uses self.screen and you will get a RuntimeError if screen != None: pygame.init() self.screen = screen GAME_W, GAME_H = 160, 210 self.size = GAME_W * V.xSCALE, GAME_H * V.ySCALE # Get & Set the desired settings self.ale = ALEInterface() self.ale.setInt("random_seed", rndseed) self.ale.setBool('sound', False) self.ale.setBool('display_screen', False) self.ale.setBool('color_averaging', COLOR_AVG) self.ale.setFloat('repeat_action_probability', 0.0) # Load the ROM file self.ale.loadROM(rom_file) self.gamename = os.path.basename(rom_file).split('.')[0] self.clock = pygame.time.Clock() self._last_time = time.time() self.score = 0 self.episode = 0 self.frame_cnt = 0 # Get the list of legal actions self.legal_actions = self.ale.getLegalActionSet() if resume_state_file: self.loadALEState(resume_state_file) def saveALEState(self, fname): basedir = os.path.dirname(fname) if not os.path.exists(basedir): os.makedirs(basedir) pALEState = self.ale.cloneSystemState( ) # actually it returns an int, a memory address pointing to a C++ object ALEState serialized_np = self.ale.encodeState( pALEState) # this func actually takes a pointer np.savez(fname, state=serialized_np, score=self.score, episode=self.episode) def loadALEState(self, fname): npzfile = np.load(fname) serialized_np = npzfile['state'] self.score = npzfile['score'] self.episode = npzfile['episode'] pALEState = self.ale.decodeState( serialized_np ) # actually it returns an int, a memory address pointing to a C++ object ALEState self.ale.restoreSystemState( pALEState) # this func actually takes a pointer def proceed_one_step(self, action, refresh_screen=False, fps_limit=0, model_gaze_output=None, gc_window_drawer_func=None): self.clock.tick( fps_limit) # control FPS. fps_limit == 0 means no limit self.frame_cnt += 1 # Display FPS diff_time = time.time() - self._last_time if diff_time > 1.0: print 'FPS: %.1f' % self.clock.get_fps() self._last_time = time.time() # Show game image cur_frame_np = self.ale.getScreenRGB() if refresh_screen: cur_frame_Surface = pygame.surfarray.make_surface(cur_frame_np) cur_frame_Surface = pygame.transform.flip(cur_frame_Surface, True, False) cur_frame_Surface = pygame.transform.rotate(cur_frame_Surface, 90) # Perform scaling directly on screen, leaving cur_frame_Surface unscaled. # Slightly faster than scaling cur_frame_Surface and then transfer to screen. 
pygame.transform.scale(cur_frame_Surface, self.size, self.screen) if gc_window_drawer_func != None and model_gaze_output: gc_window_drawer_func(self.screen, model_gaze_output) pygame.display.flip() # Apply an action and get the resulting reward reward = self.ale.act(action) self.score += reward return cur_frame_np, reward, self.check_episode_end_and_if_true_reset_game( ) def proceed_one_step__fast__no_scr_support(self, action): self.frame_cnt += 1 cur_frame_np = self.ale.getScreenRGB() reward = self.ale.act(action) self.score += reward return cur_frame_np, reward, self.check_episode_end_and_if_true_reset_game( ) def check_episode_end_and_if_true_reset_game(self): end = self.ale.game_over() if end: print 'Episode', self.episode, 'ended with score:', self.score self.score = 0 self.episode += 1 self.ale.reset_game() return end # after reset_game(), ale.game_over()'s return value will change to false def run(self, gc_window_drawer_func=None, save_screen_func=None, event_handler_func=None, record_a_and_r_func=None): self.run_start_time = time.time() # used in alerecord_main.py while True: self.check_episode_end_and_if_true_reset_game() self.clock.tick(FRAME_RATE) # control FPS self.frame_cnt += 1 key = pygame.key.get_pressed() if event_handler_func != None: stop, eyelink_err_code, bool_drawgc = event_handler_func( key, self) if stop: return eyelink_err_code # Display FPS diff_time = time.time() - self._last_time if diff_time > 1.0: print 'FPS: %.1f' % self.clock.get_fps() self._last_time = time.time() # Show game image cur_frame_np = self.ale.getScreenRGB() cur_frame_Surface = pygame.surfarray.make_surface(cur_frame_np) cur_frame_Surface = pygame.transform.flip(cur_frame_Surface, True, False) cur_frame_Surface = pygame.transform.rotate(cur_frame_Surface, 90) # Perform scaling directly on screen, leaving cur_frame_Surface unscaled. # Slightly faster than scaling cur_frame_Surface and then transfer to screen. pygame.transform.scale(cur_frame_Surface, self.size, self.screen) if gc_window_drawer_func != None and bool_drawgc: gc_window_drawer_func(self.screen) pygame.display.flip() # Save frame to disk (160*210, i.e. not scaled; because this is faster) if save_screen_func != None: save_screen_func(cur_frame_Surface, self.frame_cnt) # Apply an action and get the resulting reward a_index = aenum.action_map(key, self.gamename) a = self.legal_actions[a_index] reward = self.ale.act(a) self.score += reward if record_a_and_r_func != None: record_a_and_r_func(a, reward, self.episode, self.score) pygame.event.pump() # need this line to get new key pressed assert False, "Returning should only happen in the while True loop" def run_in_step_by_step_mode(self, gc_window_drawer_func=None, save_screen_func=None, event_handler_func=None, record_a_and_r_func=None): bool_drawgc = False self.run_start_time = time.time() # used in alerecord_main.py while True: self.check_episode_end_and_if_true_reset_game() # Get game image cur_frame_np = self.ale.getScreenRGB() cur_frame_Surface = pygame.surfarray.make_surface(cur_frame_np) cur_frame_Surface = pygame.transform.flip(cur_frame_Surface, True, False) cur_frame_Surface = pygame.transform.rotate(cur_frame_Surface, 90) self.frame_cnt += 1 # Save frame to disk (160*210, i.e. 
not scaled; because this is faster) if save_screen_func != None: save_screen_func(cur_frame_Surface, self.frame_cnt) key, draw_next_game_frame = None, False while not draw_next_game_frame: self.clock.tick(FRAME_RATE) # control FPS key = pygame.key.get_pressed() if event_handler_func != None: stop, eyelink_err_code, bool_drawgc = event_handler_func( key, self) if stop: return eyelink_err_code a_index = aenum.action_map(key, self.gamename) # Not in all cases when action_map returns "NO OP" is the real action "NO OP", # Only when the human press "TAB", is the real action "NO OP". if (a_index == aenum.PLAYER_A_NOOP and key[pygame.K_TAB]) \ or a_index != aenum.PLAYER_A_NOOP: draw_next_game_frame = True # Draw the image onto screen. # Perform scaling directly on screen, leaving cur_frame_Surface unscaled. pygame.transform.scale(cur_frame_Surface, self.size, self.screen) if gc_window_drawer_func != None and bool_drawgc: gc_window_drawer_func(self.screen) pygame.display.flip() pygame.event.pump() # need this line to get new key pressed # Apply an action and get the resulting reward a = self.legal_actions[a_index] reward = self.ale.act(a) self.score += reward if record_a_and_r_func != None: record_a_and_r_func(a, reward, self.episode, self.score) assert False, "Returning code should only be in the while True loop"
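# Minimal usage sketch for aleForET (illustrative only, not part of the
# original module).  The window size mirrors the 160x210 ALE screen scaled by
# V.xSCALE/V.ySCALE, which this module defines elsewhere; the ROM path and
# the NOOP-only policy are placeholders.
def watch_random_episode(rom_file, num_steps=500, seed=42):
    pygame.init()
    screen = pygame.display.set_mode((160 * V.xSCALE, 210 * V.ySCALE))
    game = aleForET(rom_file, screen, rndseed=seed)
    for _ in range(num_steps):
        action = game.legal_actions[0]  # NOOP; a real agent would choose here
        frame, reward, episode_ended = game.proceed_one_step(
            action, refresh_screen=True, fps_limit=60)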