Example #1
0
import os
from collections import deque

import numpy as np
from ale_python_interface import ALEInterface


class AtariWrapper():
    """
    ALE wrapper that tries to mimic the options in the DQN paper including the 
    preprocessing (except resizing/cropping)
    """
    action_words = [
        'NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', "UPRIGHT", "UPLEFT",
        "DOWNRIGHT", "DOWNLEFT"
    ]
    _action_set = [0, 2, 3, 4, 5, 6, 7, 8, 9]
    #Valid actions for ALE.
    #Possible actions are just the list 0..num_valid_actions-1;
    #we still need to map from the latter to the former when acting.

    possible_actions = list(range(len(_action_set)))

    def __init__(self,
                 rom_path,
                 seed=123,
                 frameskip=4,
                 show_display=False,
                 stack_num_states=4,
                 concatenate_state_every=4):
        """

        Parameters:
            frameskip: Either a tuple (indicating a random range to choose from,
                with the top value excluded) or an int. Also known as action repeat.

            stack_num_states: Number of dimensions/channels the state should have.

            concatenate_state_every: After how many frames one channel should be
                appended to the state. The number is in absolute frames,
                independent of frameskip.
        """

        self.stack_num_states = stack_num_states
        self.concatenate_state_every = concatenate_state_every

        self.game_path = rom_path
        if not os.path.exists(self.game_path):
            raise IOError('You asked for ROM %s but the path does not exist' %
                          self.game_path)
        self.frameskip = frameskip

        try:
            self.ale = ALEInterface()
        except Exception as e:
            print(
                "ALEInterface could not be loaded. ale_python_interface import failed"
            )
            raise e

        #Set some default options
        self.ale.setInt(b'random_seed', seed)
        self.ale.setBool(b'sound', False)
        self.ale.setBool(b'display_screen', show_display)
        self.ale.setFloat(b'repeat_action_probability', 0.)

        #Load the rom
        self.ale.loadROM(self.game_path)

        (self.screen_width, self.screen_height) = self.ale.getScreenDims()
        self.latest_frame_fifo = deque(
            maxlen=2)  #Holds the two most recent frames for a pixel-wise max.
        self.state_fifo = deque(maxlen=stack_num_states)

    def _step(self, a, force_noop=False):
        """Perform one step of the environment.
        Automatically repeats the chosen action self.frameskip times.

        Parameters:
            force_noop: Force a no-op, ignoring the action supplied.
        """
        assert a in self.possible_actions + [0]

        if force_noop:
            action, num_steps = 0, 1
        else:
            action = self._action_set[a]
            if isinstance(self.frameskip, int):
                num_steps = self.frameskip
            else:
                num_steps = np.random.randint(self.frameskip[0],
                                              self.frameskip[1])

        reward = 0.0
        for i in range(num_steps):
            reward += self.ale.act(action)
            cur_frame = self.observe_raw(get_rgb=True)
            cur_frame_cropped = self.crop_frame(cur_frame)
            self.latest_frame_fifo.append(cur_frame_cropped)

            if i % self.concatenate_state_every == 0:
                curmax_frame = np.amax(self.latest_frame_fifo, axis=0)
                frame_lumi = self.convert_to_gray(curmax_frame)
                self.state_fifo.append(frame_lumi)

        #Transpose so we get HxWxC instead of CxHxW
        self.current_frame = np.array(np.transpose(self.state_fifo, (1, 2, 0)))
        return self.current_frame, reward, self.ale.game_over(), {
            "ale.lives": self.ale.lives()
        }

    def step(self, *args, **kwargs):
        """Performs one step of the environment
        """
        lives_before = self.ale.lives()
        next_state, reward, done, info = self._step(*args, **kwargs)
        lives_after = self.ale.lives()

        # End the episode when a life is lost
        if lives_before > lives_after:
            done = True

        return next_state, reward, done, info

    def observe_raw(self, get_rgb=False):
        """Observe either RGB or grayscale frames.
        Initializing fresh arrays ensures we do not modify stale pointers.
        """
        if get_rgb:
            cur_frame_rgb = np.zeros(
                (self.screen_height, self.screen_width, 3), dtype=np.uint8)
            self.ale.getScreenRGB(cur_frame_rgb)
            return cur_frame_rgb
        else:
            cur_frame_gray = np.zeros((self.screen_height, self.screen_width),
                                      dtype=np.uint8)
            self.ale.getScreenGrayscale(cur_frame_gray)
            return cur_frame_gray

    def crop_frame(self, frame):
        """Simply crops a frame. Does nothing by default.
        """
        return frame

    def convert_to_gray(self, img):
        """Get the luminance channel (ITU-R BT.601 weights).
        """
        img_f = np.float32(img)
        img_lumi = 0.299*img_f[:,:,0] + \
                    0.587*img_f[:,:,1] + \
                    0.114*img_f[:,:,2]
        return np.uint8(img_lumi)

    def reset(self):
        """Reset the game
        """
        self.ale.reset_game()
        s = self.observe_raw(get_rgb=True)
        s = self.crop_frame(s)

        #Populate missing frames with blank ones.
        for _ in range(self.stack_num_states - 1):
            self.state_fifo.append(
                np.zeros(shape=(s.shape[0], s.shape[1]), dtype=np.uint8))

        self.latest_frame_fifo.append(s)

        #Push the latest frame
        curmax_frame = s
        frame_lumi = self.convert_to_gray(s)
        self.state_fifo.append(frame_lumi)

        self.state = np.transpose(self.state_fifo, (1, 2, 0))
        return self.state

    def get_action_meanings(self):
        """Return in text what the actions correspond to.
        """
        return list(self.action_words)

    def save_state(self):
        """Saves the current state and returns an identifier for the saved state.
        """
        return self.ale.cloneSystemState()

    def restore_state(self, ident):
        """Restore game state.
        Restores the saved state of the system and performs a no-op
        so a new frame can be generated in case a restore is followed
        by an observe().
        """

        self.ale.restoreSystemState(ident)
        self.step(0, force_noop=True)
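
A minimal usage sketch for the AtariWrapper above; the ROM path and the random policy are illustrative assumptions, not part of the original code.

# Hypothetical usage sketch (assumed ROM path, random policy for illustration).
import numpy as np

env = AtariWrapper(rom_path='/path/to/breakout.bin', seed=123, frameskip=4)
state = env.reset()                                 # H x W x stack_num_states
done, total_reward = False, 0.0
while not done:
    a = np.random.choice(env.possible_actions)      # random policy
    state, reward, done, info = env.step(a)
    total_reward += reward
print('episode reward:', total_reward, 'lives left:', info['ale.lives'])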
Example #2
0
class Environment:
	"""Wraps ALE and drives agent training, evaluation, recording, and logging."""

	BUFFER_LEN = 2
	EPISODE_FRAMES = 18000
	EPOCH_COUNT = 200
	EPOCH_STEPS = 250000
	EVAL_EPS = 0.001
	FRAMES_SKIP = 4
	FRAME_HEIGHT = 84
	FRAME_WIDTH = 84
	MAX_NO_OP = 30
	MAX_REWARD = 1
	
	def __init__(self, rom_name, rng, display_screen = False):
		self.api = ALEInterface()
		self.api.setInt('random_seed', rng.randint(333))
		self.api.setBool('display_screen', display_screen)
		self.api.setFloat('repeat_action_probability', 0.0)
		self.rom_name = rom_name
		self.display_screen = display_screen
		self.rng = rng
		self.repeat = Environment.FRAMES_SKIP
		self.buffer_len = Environment.BUFFER_LEN
		self.height = Environment.FRAME_HEIGHT
		self.width = Environment.FRAME_WIDTH
		self.episode_steps = Environment.EPISODE_FRAMES / Environment.FRAMES_SKIP
		self.merge_id = 0
		self.max_reward = Environment.MAX_REWARD
		self.eval_eps = Environment.EVAL_EPS
		self.log_dir = ''
		self.network_dir = ''

		self.api.loadROM('../rom/' + self.rom_name)
		self.minimal_actions = self.api.getMinimalActionSet()
		original_width, original_height = self.api.getScreenDims()
		self.merge_frame = np.zeros((self.buffer_len
								, original_height
								, original_width)
								, dtype = np.uint8)

	def get_action_count(self):
		return len(self.minimal_actions)

	def train(self, agent, store_freq, folder = None, start_epoch = 0):
		self._open_log_files(agent, folder)
		obs = np.zeros((self.height, self.width), dtype = np.uint8)
		epoch_count = Environment.EPOCH_COUNT

		for epoch in xrange(start_epoch, epoch_count):
			self.need_reset = True
			steps_left = Environment.EPOCH_STEPS

			print "\n" + "=" * 50
			print "Epoch #%d" % (epoch + 1)
			episode = 0
			train_start = time.time()
			while steps_left > 0:
				num_step, _ = self._run_episode(agent, steps_left, obs)
				steps_left -= num_step
				episode += 1
				if steps_left == 0 or episode % 10 == 0:
					print "Finished episode #%d, steps_left = %d" \
						% (episode, steps_left)
			train_end = time.time()

			valid_values = agent.get_validate_values()
			eval_values = self.evaluate(agent)
			test_end = time.time()

			train_time = train_end - train_start
			test_time = test_end - train_end
			step_per_sec = Environment.EPOCH_STEPS * 1. / max(1, train_time)
			print "\tFinished epoch #%d, episode trained = %d\n" \
				"\tValidate values = %.3f, evaluate reward = %.3f\n"\
				"\tTrain time = %.0fs, test time = %.0fs, steps/sec = %.4f" \
					% (epoch + 1, episode, valid_values, eval_values\
						, train_time, test_time, step_per_sec)

			self._update_log_files(agent, epoch + 1, episode
								, valid_values, eval_values
								, train_time, test_time
								, step_per_sec, store_freq)
			gc.collect()

	def evaluate(self, agent, episodes = 30, obs = None):
		print "\n***Start evaluating"
		if obs is None:
			obs = np.zeros((self.height, self.width), dtype = np.uint8)
		sum_reward = 0.0
		sum_step = 0.0
		for episode in xrange(episodes):
			self.need_reset = True
			step, reward = self._run_episode(agent, self.episode_steps, obs
											, self.eval_eps, evaluating = True)
			sum_reward += reward
			sum_step += step
			print "Finished episode %d, reward = %d, step = %d" \
					% (episode + 1, reward, step)
		self.need_reset = True
		print "Average reward per episode = %.4f" % (sum_reward / episodes)
		print "Average step per episode = %.4f" % (sum_step / episodes)
		return sum_reward / episodes

	def _prepare_game(self):
		if self.need_reset or self.api.game_over():
			self.api.reset_game()
			self.need_reset = False
			if Environment.MAX_NO_OP > 0:
				num_no_op = self.rng.randint(Environment.MAX_NO_OP + 1) \
							+ self.buffer_len
				for _ in xrange(num_no_op):
					self.api.act(0)

		for _ in xrange(self.buffer_len):
			self._update_buffer()

	def _run_episode(self, agent, steps_left, obs
					, eps = 0.0, evaluating = False):
		self._prepare_game()

		start_lives = self.api.lives()
		step_count = 0
		sum_reward = 0
		is_terminal = False
		while step_count < steps_left and not is_terminal:
			self._get_screen(obs)
			action_id, _ = agent.get_action(obs, eps, evaluating)
			
			reward = self._repeat_action(self.minimal_actions[action_id])
			reward_clip = reward
			if self.max_reward > 0:
				reward_clip = np.clip(reward, -self.max_reward, self.max_reward)

			life_lost = not evaluating and self.api.lives() < start_lives
			is_terminal = self.api.game_over() or life_lost \
						or step_count + 1 >= steps_left

			agent.add_experience(obs, is_terminal, action_id, reward_clip
								, evaluating)
			sum_reward += reward
			step_count += 1
			
		return step_count, sum_reward

	def _update_buffer(self):
		self.api.getScreenGrayscale(self.merge_frame[self.merge_id, ...])
		self.merge_id = (self.merge_id + 1) % self.buffer_len

	def _repeat_action(self, action):
		reward = 0
		for i in xrange(self.repeat):
			reward += self.api.act(action)
			if i + self.buffer_len >= self.repeat:
				self._update_buffer()
		return reward

	def _get_screen(self, resized_frame):
		self._resize_frame(self.merge_frame.max(axis = 0), resized_frame)
				
	def _resize_frame(self, src_frame, dst_frame):
		cv2.resize(src = src_frame, dst = dst_frame,
					dsize = (self.width, self.height),
					interpolation = cv2.INTER_LINEAR)

	def _open_log_files(self, agent, folder):
		time_str = time.strftime("_%m-%d-%H-%M", time.localtime())
		base_rom_name = os.path.splitext(os.path.basename(self.rom_name))[0]


		if folder is not None:
			self.log_dir = folder
			self.network_dir = self.log_dir + '/network'
		else:
			self.log_dir = '../run_results/' + base_rom_name + time_str
			self.network_dir = self.log_dir + '/network'

		info_name = get_next_name(self.log_dir, 'info', 'txt')
		git_name = get_next_name(self.log_dir, 'git-diff', '')

		try:
			os.stat(self.log_dir)
		except OSError:
			os.makedirs(self.log_dir)

		try:
			os.stat(self.network_dir)
		except OSError:
			os.makedirs(self.network_dir)

		with open(os.path.join(self.log_dir, info_name), 'w') as f:
			f.write('Commit: ' + subprocess.check_output(['git', 'rev-parse'
														, 'HEAD']))
			f.write('Run command: ')
			f.write(' '.join(pipes.quote(x) for x in sys.argv))
			f.write('\n\n')
			f.write(agent.get_info())
			write_info(f, Environment)
			write_info(f, agent.__class__)
			write_info(f, agent.network.__class__)

		# From https://github.com/spragunr/deep_q_rl/pull/49/files
		with open(os.path.join(self.log_dir, git_name), 'w') as f:
			f.write(subprocess.check_output(['git', 'diff', 'HEAD']))

		if folder is not None:
			return

		with open(os.path.join(self.log_dir, 'results.csv'), 'w') as f:
			f.write("epoch,episode_train,validate_values,evaluate_reward"\
				",train_time,test_time,steps_per_second\n")

		mem = psutil.virtual_memory()
		with open(os.path.join(self.log_dir, 'memory.csv'), 'w') as f:
			f.write("epoch,available,free,buffers,cached"\
					",available_readable,used_percent\n")
			f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \
					(0, mem.available, mem.free, mem.buffers, mem.cached
					, bytes2human(mem.available), mem.percent))

	def _update_log_files(self, agent, epoch, episode, valid_values
						, eval_values, train_time, test_time, step_per_sec
						, store_freq):
		print "Updating log files"
		with open(self.log_dir + '/results.csv', 'a') as f:
			f.write("%d,%d,%.4f,%.4f,%d,%d,%.4f\n" % \
						(epoch, episode, valid_values, eval_values
						, train_time, test_time, step_per_sec))

		mem = psutil.virtual_memory()
		with open(self.log_dir + '/memory.csv', 'a') as f:
			f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \
					(epoch, mem.available, mem.free, mem.buffers, mem.cached
					, bytes2human(mem.available), mem.percent))

		agent.dump_network(self.network_dir + ('/%03d' % (epoch)) + '.npz')

		if (store_freq >= 0 and epoch >= Environment.EPOCH_COUNT) or \
				(store_freq > 0 and (epoch % store_freq == 0)):
			agent.dump_exp(self.network_dir + '/exp.npz')

	def _setup_record(self, network_file):
		file_name, _ = os.path.splitext(os.path.basename(network_file))
		time_str = time.strftime("_%m-%d-%H-%M", time.localtime())
		img_dir = os.path.dirname(network_file) + '/images_' \
					+ file_name + time_str
		rom_name, _ = os.path.splitext(self.rom_name)
		out_name = os.path.dirname(network_file) + '/' + rom_name + '_' \
					+ file_name + time_str + '.mov'
		print out_name

		try:
			os.stat(img_dir)
		except OSError:
			os.makedirs(img_dir)

		self.api.setString('record_screen_dir', img_dir)
		self.api.loadROM('../rom/' + self.rom_name)
		return img_dir, out_name

	def record_run(self, agent, network_file, episode_id = 1):
		if episode_id > 1:
			self.evaluate(agent, episode_id - 1)
			system_state = self.api.cloneSystemState()

		img_dir, out_name = self._setup_record(network_file)

		if episode_id > 1:
			self.api.restoreSystemState(system_state)

		self.evaluate(agent, 1)
		script = \
				"""
					{
						ffmpeg -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s
					} || {
						avconv -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s
					}
				""" % (img_dir, out_name, img_dir, out_name)
		os.system(script)
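
A minimal, hypothetical driver for the Environment class above (Python 2, matching the snippet); the ROM name, the rng seed, and the agent object are illustrative assumptions.

# Hypothetical driver sketch (Python 2 style, matching the snippet above).
# The ROM name, the seed, and the `agent` object are assumptions.
import numpy as np

rng = np.random.RandomState(123)
env = Environment('breakout.bin', rng, display_screen=False)
print "Number of minimal actions:", env.get_action_count()
# agent = SomeDQNAgent(num_actions=env.get_action_count())  # assumed to exist
# env.train(agent, store_freq=10)
# avg_reward = env.evaluate(agent, episodes=30)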
Example #3
def main():
    result = {
        'name': [],
        'grouped_num': [],
        'distribution': [],
    }
    result_str = ''

    # all_game_list = ['air_raid-n', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis']
    # all_game_list = ['bank_heist', 'battle_zone', 'beam_rider', 'berzerk-n', 'bowling', 'boxing', 'breakout', 'carnival-n']
    # all_game_list = ['centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk']
    # all_game_list = ['elevator_action-n', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar']
    # all_game_list = ['hero', 'ice_hockey', 'jamesbond', 'journey_escape-n', 'kangaroo', 'krull', 'kung_fu_master']
    # all_game_list = ['montezuma_revenge-n', 'ms_pacman', 'name_this_game', 'phoenix-n', 'pitfall-n', 'pong', 'pooyan-n']
    # all_game_list = ['private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing-n']
    # all_game_list = ['solaris-n', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down']
    # all_game_list = ['venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge-n', 'zaxxon']

    # all_game_list = ['pong', 'assault','ms_pacman']
    all_game_list = ['assault']
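
    # Note: `test`, `sequence`, `bunch`, `frame_skip`, `process_ram`, and
    # `clear_print` are assumed to be module-level settings/helpers defined
    # elsewhere in the original script; they are not shown in this snippet.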

    for game in all_game_list:

        if '-n' in game:
            # Skip games that are not in the Nature DQN list.
            continue

        import atari_py
        game_path = atari_py.get_game_path(game)
        game_path = str.encode(game_path)

        env = ALEInterface()
        env.setFloat('repeat_action_probability'.encode('utf-8'), 0.0)
        env.setInt(b'random_seed', 3)
        env.loadROM(game_path)
        env.reset_game()

        if test in ['restoreState']:
            state_after_reset = env.cloneState()
        if test in ['restoreSystemState']:
            state_after_reset = env.cloneSystemState()
        if test in ['setRAM']:
            ram_after_reset = env.getRAM()
            state_after_reset = env.cloneSystemState()
            ram_candidate = np.load(
                './stochasticity_ram_mask/{}.npy'.format(game), )

        print('=====================================================')
        try:
            action_sequence = np.load(
                './action_sequence/action_sequence_{}_{}.npy'.format(
                    sequence,
                    game,
                ))
            print('action_sequence loaded')
        except Exception as e:
            # Generate a sequence of actions.
            action_sequence = np.random.randint(
                len(env.getMinimalActionSet()),
                size=sequence,
            )
            np.save(
                './action_sequence/action_sequence_{}_{}.npy'.format(
                    sequence,
                    game,
                ),
                action_sequence,
            )
            print('action_sequence generated')
        print('=====================================================')

        bunch_obs = []
        distribution = []
        episode_length = -1
        state_metrix = []
        ram_metrix = []
        for bunch_i in range(bunch):

            if test in ['loadROM']:
                env.setInt(b'random_seed', bunch_i)
                env.loadROM(game_path)
                env.reset_game()
            elif test in ['restoreState']:
                env.restoreState(state_after_reset)
            elif test in ['restoreSystemState']:
                env.restoreSystemState(state_after_reset)
            elif test in ['setRAM']:
                env.reset_game()
                env.restoreSystemState(state_after_reset)
                env.setRAM(ram_after_reset)
                env.setRAM(env.getRAM() * (1 - ram_candidate) + ram_candidate *
                           (bunch_i % 255))

            state_sequence = []
            ram_sequence = []

            has_terminated = False
            for sequence_i in range(sequence):

                for frame_skip_i in range(frame_skip):
                    if not has_terminated:
                        env.act(env.getMinimalActionSet()[
                            action_sequence[sequence_i]])
                        if env.game_over():
                            episode_length = sequence_i
                            has_terminated = True
                    if has_terminated:
                        break

                try:
                    clear_print('[{}|{}|{}]'.format(bunch_i, sequence_i,
                                                    episode_length))
                except Exception as e:
                    pass

                state_sequence += [env.getScreenRGB()]
                ram_sequence += [process_ram(env.getRAM())]

                if has_terminated:
                    break

            if sequence > 0:
                if episode_length < 0:
                    # raise Exception('Episode did not terminate')
                    print('# WARNING: episode did not terminate')

            obs = env.getScreenRGB()

            state_metrix += [copy.deepcopy(state_sequence)]
            ram_metrix += [copy.deepcopy(ram_sequence)]

            if_has_identical_one = False
            for bunch_obs_i in range(len(bunch_obs)):
                max_value = np.max(np.abs(obs - bunch_obs[bunch_obs_i]))
                if max_value < 1:
                    if_has_identical_one = True
                    distribution[bunch_obs_i] += 1
                    break

            if if_has_identical_one is False:
                bunch_obs += [obs]
                distribution += [1]

        grouped_num = len(bunch_obs)
        result_str = '{}game:{} grouped_num:{} distribution:{} \n'.format(
            result_str,
            game,
            grouped_num,
            distribution,
        )
        try:
            game_list += [game]
        except Exception as e:
            game_list = [game]
        try:
            grouped_num_list += [grouped_num]
        except Exception as e:
            grouped_num_list = [grouped_num]

        max_length = 0
        for bunch_i in range(len(state_metrix)):
            if len(state_metrix[bunch_i]) > max_length:
                max_length = len(state_metrix[bunch_i])
        for bunch_i in range(len(state_metrix)):
            # Compute the pad count before extending state_metrix, otherwise
            # ram_metrix would never be padded to the same length.
            pad_count = max_length - len(state_metrix[bunch_i])
            state_metrix[bunch_i] += ([
                np.zeros(shape=state_metrix[0][0].shape,
                         dtype=state_metrix[0][0].dtype)
            ] * pad_count)
            ram_metrix[bunch_i] += ([
                np.zeros(shape=ram_metrix[0][0].shape,
                         dtype=ram_metrix[0][0].dtype)
            ] * pad_count)

        state_list = []
        state_metrix_id = np.zeros((len(state_metrix), len(state_metrix[0])),
                                   dtype=int)
        for bunch_i in range(len(state_metrix)):
            for sequence_i in range(len(state_metrix[0])):
                found_in_state_list = False
                for state_list_id in range(len(state_list)):
                    if np.max(state_list[state_list_id] -
                              state_metrix[bunch_i][sequence_i]) < 1:
                        state_metrix_id[bunch_i][sequence_i] = state_list_id
                        found_in_state_list = True
                        break
                if not found_in_state_list:
                    state_list += [np.copy(state_metrix[bunch_i][sequence_i])]
                    state_metrix_id[bunch_i][sequence_i] = (len(state_list) -
                                                            1)

        state_metrix_id_unsorted = np.copy(state_metrix_id)
        state_metrix_id = state_metrix_id.tolist()
        state_metrix_id.sort(key=lambda row: row[:], reverse=True)
        state_metrix_id = np.array(state_metrix_id)

        fig, ax = plt.subplots()
        im = ax.imshow(state_metrix_id)
        # Save before show(); saving afterwards can produce a blank figure.
        plt.savefig(
            './results/{}_state_metrix_id.jpg'.format(game),
            dpi=600,
        )
        plt.show()

        state_metrix_figure = np.zeros(
            ((10 + state_metrix[0][0].shape[0]) * len(state_metrix),
             state_metrix[0][0].shape[1] * len(state_metrix[0]),
             state_metrix[0][0].shape[2]),
            dtype=state_metrix[0][0].dtype)
        ram_metrix_figure = np.zeros(
            ((5 + ram_metrix[0][0].shape[0]) * len(state_metrix),
             ram_metrix[0][0].shape[1] * len(state_metrix[0]),
             ram_metrix[0][0].shape[2]),
            dtype=ram_metrix[0][0].dtype)

        ram_candidate = list(range(env.getRAMSize()))

        for bunch_i in range(len(state_metrix)):
            ram_metrix_figure[((bunch_i) * (5 + ram_metrix[0][0].shape[0])):(
                5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])), :, 2] = 255
        for bunch_i in range(len(state_metrix)):
            for sequence_i in range(len(state_metrix[0])):
                state_metrix_figure[
                    (10 + (bunch_i) *
                     (10 + state_metrix[0][0].shape[0])):(bunch_i + 1) *
                    (10 + state_metrix[0][0].shape[0]), (sequence_i) *
                    state_metrix[0][0].shape[1]:(sequence_i + 1) *
                    state_metrix[0][0].shape[1]] = state_list[
                        state_metrix_id[bunch_i][sequence_i]]
                for bunch_ii in range(state_metrix_id.shape[0]):
                    if np.max(np.abs(state_metrix_id_unsorted[bunch_ii] -
                                     state_metrix_id[bunch_i])) < 1:
                        at_unsorted_bunch = bunch_ii
                        break
                ram_metrix_figure[(
                    5 + (bunch_i) *
                    (5 + ram_metrix[0][0].shape[0])):(bunch_i + 1) *
                                  (5 + ram_metrix[0][0].shape[0]),
                                  (sequence_i) *
                                  ram_metrix[0][0].shape[1]:(sequence_i + 1) *
                                  ram_metrix[0][0].shape[1]] = ram_metrix[
                                      at_unsorted_bunch][sequence_i]

        for bunch_i in range(len(state_metrix)):
            for sequence_i in range(len(state_metrix[0])):
                if bunch_i > 0:
                    if state_metrix_id[bunch_i][sequence_i] != state_metrix_id[
                            bunch_i - 1][sequence_i]:
                        # draw a line to separate the bunches
                        previous = ram_metrix_figure[(
                            5 + (bunch_i - 1) *
                            (5 + ram_metrix[0][0].shape[0])):(
                                (bunch_i) * (5 + ram_metrix[0][0].shape[0])),
                                                     sequence_i, 0]
                        later = ram_metrix_figure[(
                            5 + (bunch_i) * (5 + ram_metrix[0][0].shape[0])):(
                                (bunch_i + 1) *
                                (5 + ram_metrix[0][0].shape[0])), sequence_i,
                                                  0]
                        delta = np.abs(previous - later)
                        state_metrix_figure[(
                            (bunch_i) * (10 + state_metrix[0][0].shape[0])):(
                                10 + (bunch_i) *
                                (10 + state_metrix[0][0].shape[0])),
                                            (sequence_i) *
                                            state_metrix[0][0].shape[1]:,
                                            0] = 255
                        ram_metrix_figure[((bunch_i) *
                                           (5 + ram_metrix[0][0].shape[0])
                                           ):(5 + (bunch_i) *
                                              (5 + ram_metrix[0][0].shape[0])),
                                          (sequence_i) *
                                          ram_metrix[0][0].shape[1]:, 0] = 255
                        ram_metrix_figure[((bunch_i) *
                                           (5 + ram_metrix[0][0].shape[0])
                                           ):(5 + (bunch_i) *
                                              (5 + ram_metrix[0][0].shape[0])),
                                          (sequence_i) *
                                          ram_metrix[0][0].shape[1]:, 1:] = 0

        from PIL import Image
        Image.fromarray(state_metrix_figure).save(
            "./results/{}_state_metrix_figure.jpeg".format(game))
        Image.fromarray(ram_metrix_figure.astype(
            state_metrix_figure.dtype)).save(
                "./results/{}_ram_metrix_figure.jpeg".format(game))

    print(result_str)
    print('===============')
    for game_i in range(len(game_list)):
        print(game_list[game_i])
    for grouped_num_i in range(len(grouped_num_list)):
        print(grouped_num_list[grouped_num_i])
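
The grouping step in main() above clusters rollouts by their final frame. A condensed, standalone sketch of that idea (the `frames` input is a hypothetical stand-in for the per-bunch observations):

import numpy as np

def group_identical_frames(frames):
    """Return (representatives, counts) where pixel-identical frames share a group."""
    representatives, counts = [], []
    for frame in frames:
        for idx, rep in enumerate(representatives):
            if np.array_equal(frame, rep):
                counts[idx] += 1
                break
        else:
            representatives.append(frame)
            counts.append(1)
    return representatives, counts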
Example #4
0
class aleForET:
    def __init__(self, rom_file, screen, rndseed, resume_state_file=None):
        # You may pass None for `screen` when you are not interested in running
        # any function that displays graphics; in that case you should only call
        # proceed_one_step__fast__no_scr_support(). The other functions use
        # self.screen and will raise a RuntimeError.
        if screen != None:
            pygame.init()
            self.screen = screen
        GAME_W, GAME_H = 160, 210
        self.size = GAME_W * V.xSCALE, GAME_H * V.ySCALE

        # Get & Set the desired settings
        self.ale = ALEInterface()
        self.ale.setInt("random_seed", rndseed)
        self.ale.setBool('sound', False)
        self.ale.setBool('display_screen', False)
        self.ale.setBool('color_averaging', COLOR_AVG)
        self.ale.setFloat('repeat_action_probability', 0.0)

        # Load the ROM file
        self.ale.loadROM(rom_file)
        self.gamename = os.path.basename(rom_file).split('.')[0]
        self.clock = pygame.time.Clock()
        self._last_time = time.time()
        self.score = 0
        self.episode = 0
        self.frame_cnt = 0

        # Get the list of legal actions
        self.legal_actions = self.ale.getLegalActionSet()
        if resume_state_file:
            self.loadALEState(resume_state_file)

    def saveALEState(self, fname):
        basedir = os.path.dirname(fname)
        if not os.path.exists(basedir):
            os.makedirs(basedir)
        pALEState = self.ale.cloneSystemState(
        )  # actually it returns an int, a memory address pointing to a C++ object ALEState
        serialized_np = self.ale.encodeState(
            pALEState)  # this func actually takes a pointer
        np.savez(fname,
                 state=serialized_np,
                 score=self.score,
                 episode=self.episode)

    def loadALEState(self, fname):
        npzfile = np.load(fname)
        serialized_np = npzfile['state']
        self.score = npzfile['score']
        self.episode = npzfile['episode']
        pALEState = self.ale.decodeState(
            serialized_np
        )  # actually it returns an int, a memory address pointing to a C++ object ALEState
        self.ale.restoreSystemState(
            pALEState)  # this func actually takes a pointer

    def proceed_one_step(self,
                         action,
                         refresh_screen=False,
                         fps_limit=0,
                         model_gaze_output=None,
                         gc_window_drawer_func=None):
        self.clock.tick(
            fps_limit)  # control FPS. fps_limit == 0 means no limit
        self.frame_cnt += 1

        # Display FPS
        diff_time = time.time() - self._last_time
        if diff_time > 1.0:
            print 'FPS: %.1f' % self.clock.get_fps()
            self._last_time = time.time()

        # Show game image
        cur_frame_np = self.ale.getScreenRGB()
        if refresh_screen:
            cur_frame_Surface = pygame.surfarray.make_surface(cur_frame_np)
            cur_frame_Surface = pygame.transform.flip(cur_frame_Surface, True,
                                                      False)
            cur_frame_Surface = pygame.transform.rotate(cur_frame_Surface, 90)
            # Perform scaling directly on screen, leaving cur_frame_Surface unscaled.
            # Slightly faster than scaling cur_frame_Surface and then transfer to screen.
            pygame.transform.scale(cur_frame_Surface, self.size, self.screen)

            if gc_window_drawer_func != None and model_gaze_output:
                gc_window_drawer_func(self.screen, model_gaze_output)
            pygame.display.flip()

        # Apply an action and get the resulting reward
        reward = self.ale.act(action)
        self.score += reward

        return cur_frame_np, reward, self.check_episode_end_and_if_true_reset_game(
        )

    def proceed_one_step__fast__no_scr_support(self, action):
        self.frame_cnt += 1
        cur_frame_np = self.ale.getScreenRGB()
        reward = self.ale.act(action)
        self.score += reward
        return cur_frame_np, reward, self.check_episode_end_and_if_true_reset_game(
        )

    def check_episode_end_and_if_true_reset_game(self):
        end = self.ale.game_over()
        if end:
            print 'Episode', self.episode, 'ended with score:', self.score
            self.score = 0
            self.episode += 1
            self.ale.reset_game()
        return end  # after reset_game(),  ale.game_over()'s return value will change to false

    def run(self,
            gc_window_drawer_func=None,
            save_screen_func=None,
            event_handler_func=None,
            record_a_and_r_func=None):
        self.run_start_time = time.time()  # used in alerecord_main.py
        while True:
            self.check_episode_end_and_if_true_reset_game()
            self.clock.tick(FRAME_RATE)  # control FPS
            self.frame_cnt += 1

            key = pygame.key.get_pressed()
            if event_handler_func != None:
                stop, eyelink_err_code, bool_drawgc = event_handler_func(
                    key, self)
                if stop:
                    return eyelink_err_code

            # Display FPS
            diff_time = time.time() - self._last_time
            if diff_time > 1.0:
                print 'FPS: %.1f' % self.clock.get_fps()
                self._last_time = time.time()

            # Show game image
            cur_frame_np = self.ale.getScreenRGB()
            cur_frame_Surface = pygame.surfarray.make_surface(cur_frame_np)
            cur_frame_Surface = pygame.transform.flip(cur_frame_Surface, True,
                                                      False)
            cur_frame_Surface = pygame.transform.rotate(cur_frame_Surface, 90)
            # Perform scaling directly on screen, leaving cur_frame_Surface unscaled.
            # Slightly faster than scaling cur_frame_Surface and then transfer to screen.
            pygame.transform.scale(cur_frame_Surface, self.size, self.screen)

            if gc_window_drawer_func != None and bool_drawgc:
                gc_window_drawer_func(self.screen)
            pygame.display.flip()

            # Save frame to disk (160*210, i.e. not scaled; because this is faster)
            if save_screen_func != None:
                save_screen_func(cur_frame_Surface, self.frame_cnt)

            # Apply an action and get the resulting reward
            a_index = aenum.action_map(key, self.gamename)
            a = self.legal_actions[a_index]
            reward = self.ale.act(a)
            self.score += reward
            if record_a_and_r_func != None:
                record_a_and_r_func(a, reward, self.episode, self.score)

            pygame.event.pump()  # need this line to get new key pressed
        assert False, "Returning should only happen in the while True loop"

    def run_in_step_by_step_mode(self,
                                 gc_window_drawer_func=None,
                                 save_screen_func=None,
                                 event_handler_func=None,
                                 record_a_and_r_func=None):
        bool_drawgc = False
        self.run_start_time = time.time()  # used in alerecord_main.py
        while True:
            self.check_episode_end_and_if_true_reset_game()
            # Get game image
            cur_frame_np = self.ale.getScreenRGB()
            cur_frame_Surface = pygame.surfarray.make_surface(cur_frame_np)
            cur_frame_Surface = pygame.transform.flip(cur_frame_Surface, True,
                                                      False)
            cur_frame_Surface = pygame.transform.rotate(cur_frame_Surface, 90)

            self.frame_cnt += 1
            # Save frame to disk (160*210, i.e. not scaled; because this is faster)
            if save_screen_func != None:
                save_screen_func(cur_frame_Surface, self.frame_cnt)

            key, draw_next_game_frame = None, False
            while not draw_next_game_frame:
                self.clock.tick(FRAME_RATE)  # control FPS

                key = pygame.key.get_pressed()
                if event_handler_func != None:
                    stop, eyelink_err_code, bool_drawgc = event_handler_func(
                        key, self)
                    if stop:
                        return eyelink_err_code
                a_index = aenum.action_map(key, self.gamename)
                # action_map returning "NO OP" does not always mean the real action
                # is "NO OP"; only when the human presses "TAB" is it a real "NO OP".
                if (a_index == aenum.PLAYER_A_NOOP and key[pygame.K_TAB]) \
                or  a_index != aenum.PLAYER_A_NOOP:
                    draw_next_game_frame = True

                # Draw the image onto screen.
                # Perform scaling directly on screen, leaving cur_frame_Surface unscaled.
                pygame.transform.scale(cur_frame_Surface, self.size,
                                       self.screen)

                if gc_window_drawer_func != None and bool_drawgc:
                    gc_window_drawer_func(self.screen)

                pygame.display.flip()
                pygame.event.pump()  # need this line to get new key pressed

            # Apply an action and get the resulting reward
            a = self.legal_actions[a_index]
            reward = self.ale.act(a)
            self.score += reward
            if record_a_and_r_func != None:
                record_a_and_r_func(a, reward, self.episode, self.score)
        assert False, "Returning code should only be in the while True loop"
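
A minimal, hypothetical driver for aleForET above (Python 2 style, matching the snippet). The ROM path and the fixed NOOP action are illustrative assumptions; passing screen=None restricts us to the display-free stepping function, as noted in __init__.

# Hypothetical driver sketch: no display, fixed NOOP action, assumed ROM path.
game = aleForET('/path/to/breakout.bin', screen=None, rndseed=42)
for _ in range(100):
    frame, reward, episode_ended = \
        game.proceed_one_step__fast__no_scr_support(game.legal_actions[0])
    if episode_ended:
        break
print 'frames stepped:', game.frame_cnt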
Example #5
0
class Environment:
    """Wraps ALE and drives agent training, evaluation, recording, and logging."""

    BUFFER_LEN = 2
    EPISODE_FRAMES = 18000
    EPOCH_COUNT = 200
    EPOCH_STEPS = 250000
    EVAL_EPS = 0.001
    FRAMES_SKIP = 4
    FRAME_HEIGHT = 84
    FRAME_WIDTH = 84
    MAX_NO_OP = 30
    MAX_REWARD = 1

    def __init__(self, rom_name, rng, display_screen=False):
        self.api = ALEInterface()
        self.api.setInt('random_seed', rng.randint(333))
        self.api.setBool('display_screen', display_screen)
        self.api.setFloat('repeat_action_probability', 0.0)
        self.rom_name = rom_name
        self.display_screen = display_screen
        self.rng = rng
        self.repeat = Environment.FRAMES_SKIP
        self.buffer_len = Environment.BUFFER_LEN
        self.height = Environment.FRAME_HEIGHT
        self.width = Environment.FRAME_WIDTH
        self.episode_steps = Environment.EPISODE_FRAMES / Environment.FRAMES_SKIP
        self.merge_id = 0
        self.max_reward = Environment.MAX_REWARD
        self.eval_eps = Environment.EVAL_EPS
        self.log_dir = ''
        self.network_dir = ''

        self.api.loadROM('../rom/' + self.rom_name)
        self.minimal_actions = self.api.getMinimalActionSet()
        original_width, original_height = self.api.getScreenDims()
        self.merge_frame = np.zeros(
            (self.buffer_len, original_height, original_width), dtype=np.uint8)

    def get_action_count(self):
        return len(self.minimal_actions)

    def train(self, agent, store_freq, folder=None, start_epoch=0):
        self._open_log_files(agent, folder)
        obs = np.zeros((self.height, self.width), dtype=np.uint8)
        epoch_count = Environment.EPOCH_COUNT

        for epoch in xrange(start_epoch, epoch_count):
            self.need_reset = True
            steps_left = Environment.EPOCH_STEPS

            print "\n" + "=" * 50
            print "Epoch #%d" % (epoch + 1)
            episode = 0
            train_start = time.time()
            while steps_left > 0:
                num_step, _ = self._run_episode(agent, steps_left, obs)
                steps_left -= num_step
                episode += 1
                if steps_left == 0 or episode % 10 == 0:
                    print "Finished episode #%d, steps_left = %d" \
                     % (episode, steps_left)
            train_end = time.time()

            valid_values = agent.get_validate_values()
            eval_values = self.evaluate(agent)
            test_end = time.time()

            train_time = train_end - train_start
            test_time = test_end - train_end
            step_per_sec = Environment.EPOCH_STEPS * 1. / max(1, train_time)
            print "\tFinished epoch #%d, episode trained = %d\n" \
             "\tValidate values = %.3f, evaluate reward = %.3f\n"\
             "\tTrain time = %.0fs, test time = %.0fs, steps/sec = %.4f" \
              % (epoch + 1, episode, valid_values, eval_values\
               , train_time, test_time, step_per_sec)

            self._update_log_files(agent, epoch + 1, episode, valid_values,
                                   eval_values, train_time, test_time,
                                   step_per_sec, store_freq)
            gc.collect()

    def evaluate(self, agent, episodes=30, obs=None):
        print "\n***Start evaluating"
        if obs is None:
            obs = np.zeros((self.height, self.width), dtype=np.uint8)
        sum_reward = 0.0
        sum_step = 0.0
        for episode in xrange(episodes):
            self.need_reset = True
            step, reward = self._run_episode(agent,
                                             self.episode_steps,
                                             obs,
                                             self.eval_eps,
                                             evaluating=True)
            sum_reward += reward
            sum_step += step
            print "Finished episode %d, reward = %d, step = %d" \
              % (episode + 1, reward, step)
        self.need_reset = True
        print "Average reward per episode = %.4f" % (sum_reward / episodes)
        print "Average step per episode = %.4f" % (sum_step / episodes)
        return sum_reward / episodes

    def _prepare_game(self):
        if self.need_reset or self.api.game_over():
            self.api.reset_game()
            self.need_reset = False
            if Environment.MAX_NO_OP > 0:
                num_no_op = self.rng.randint(Environment.MAX_NO_OP + 1) \
                   + self.buffer_len
                for _ in xrange(num_no_op):
                    self.api.act(0)

        for _ in xrange(self.buffer_len):
            self._update_buffer()

    def _run_episode(self, agent, steps_left, obs, eps=0.0, evaluating=False):
        self._prepare_game()

        start_lives = self.api.lives()
        step_count = 0
        sum_reward = 0
        is_terminal = False
        while step_count < steps_left and not is_terminal:
            self._get_screen(obs)
            action_id, _ = agent.get_action(obs, eps, evaluating)

            reward = self._repeat_action(self.minimal_actions[action_id])
            reward_clip = reward
            if self.max_reward > 0:
                reward_clip = np.clip(reward, -self.max_reward,
                                      self.max_reward)

            life_lost = not evaluating and self.api.lives() < start_lives
            is_terminal = self.api.game_over() or life_lost \
               or step_count + 1 >= steps_left

            agent.add_experience(obs, is_terminal, action_id, reward_clip,
                                 evaluating)
            sum_reward += reward
            step_count += 1

        return step_count, sum_reward

    def _update_buffer(self):
        self.api.getScreenGrayscale(self.merge_frame[self.merge_id, ...])
        self.merge_id = (self.merge_id + 1) % self.buffer_len

    def _repeat_action(self, action):
        reward = 0
        for i in xrange(self.repeat):
            reward += self.api.act(action)
            if i + self.buffer_len >= self.repeat:
                self._update_buffer()
        return reward

    def _get_screen(self, resized_frame):
        self._resize_frame(self.merge_frame.max(axis=0), resized_frame)

    def _resize_frame(self, src_frame, dst_frame):
        cv2.resize(src=src_frame,
                   dst=dst_frame,
                   dsize=(self.width, self.height),
                   interpolation=cv2.INTER_LINEAR)

    def _open_log_files(self, agent, folder):
        time_str = time.strftime("_%m-%d-%H-%M", time.localtime())
        base_rom_name = os.path.splitext(os.path.basename(self.rom_name))[0]

        if folder is not None:
            self.log_dir = folder
            self.network_dir = self.log_dir + '/network'
        else:
            self.log_dir = '../run_results/' + base_rom_name + time_str
            self.network_dir = self.log_dir + '/network'

        info_name = get_next_name(self.log_dir, 'info', 'txt')
        git_name = get_next_name(self.log_dir, 'git-diff', '')

        try:
            os.stat(self.log_dir)
        except OSError:
            os.makedirs(self.log_dir)

        try:
            os.stat(self.network_dir)
        except OSError:
            os.makedirs(self.network_dir)

        with open(os.path.join(self.log_dir, info_name), 'w') as f:
            f.write('Commit: ' +
                    subprocess.check_output(['git', 'rev-parse', 'HEAD']))
            f.write('Run command: ')
            f.write(' '.join(pipes.quote(x) for x in sys.argv))
            f.write('\n\n')
            f.write(agent.get_info())
            write_info(f, Environment)
            write_info(f, agent.__class__)
            write_info(f, agent.network.__class__)

        # From https://github.com/spragunr/deep_q_rl/pull/49/files
        with open(os.path.join(self.log_dir, git_name), 'w') as f:
            f.write(subprocess.check_output(['git', 'diff', 'HEAD']))

        if folder is not None:
            return

        with open(os.path.join(self.log_dir, 'results.csv'), 'w') as f:
            f.write("epoch,episode_train,validate_values,evaluate_reward"\
             ",train_time,test_time,steps_per_second\n")

        mem = psutil.virtual_memory()
        with open(os.path.join(self.log_dir, 'memory.csv'), 'w') as f:
            f.write("epoch,available,free,buffers,cached"\
              ",available_readable,used_percent\n")
            f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \
              (0, mem.available, mem.free, mem.buffers, mem.cached
              , bytes2human(mem.available), mem.percent))

    def _update_log_files(self, agent, epoch, episode, valid_values,
                          eval_values, train_time, test_time, step_per_sec,
                          store_freq):
        print "Updating log files"
        with open(self.log_dir + '/results.csv', 'a') as f:
            f.write("%d,%d,%.4f,%.4f,%d,%d,%.4f\n" % \
               (epoch, episode, valid_values, eval_values
               , train_time, test_time, step_per_sec))

        mem = psutil.virtual_memory()
        with open(self.log_dir + '/memory.csv', 'a') as f:
            f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \
              (epoch, mem.available, mem.free, mem.buffers, mem.cached
              , bytes2human(mem.available), mem.percent))

        agent.dump_network(self.network_dir + ('/%03d' % (epoch)) + '.npz')

        if (store_freq >= 0 and epoch >= Environment.EPOCH_COUNT) or \
          (store_freq > 0 and (epoch % store_freq == 0)):
            agent.dump_exp(self.network_dir + '/exp.npz')

    def _setup_record(self, network_file):
        file_name, _ = os.path.splitext(os.path.basename(network_file))
        time_str = time.strftime("_%m-%d-%H-%M", time.localtime())
        img_dir = os.path.dirname(network_file) + '/images_' \
           + file_name + time_str
        rom_name, _ = os.path.splitext(self.rom_name)
        out_name = os.path.dirname(network_file) + '/' + rom_name + '_' \
           + file_name + time_str + '.mov'
        print out_name

        try:
            os.stat(img_dir)
        except OSError:
            os.makedirs(img_dir)

        self.api.setString('record_screen_dir', img_dir)
        self.api.loadROM('../rom/' + self.rom_name)
        return img_dir, out_name

    def record_run(self, agent, network_file, episode_id=1):
        if episode_id > 1:
            self.evaluate(agent, episode_id - 1)
            system_state = self.api.cloneSystemState()

        img_dir, out_name = self._setup_record(network_file)

        if episode_id > 1:
            self.api.restoreSystemState(system_state)

        self.evaluate(agent, 1)
        script = \
          """
					{
						ffmpeg -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s
					} || {
						avconv -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s
					}
				""" % (img_dir, out_name, img_dir, out_name)
        os.system(script)
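
The preprocessing used by _update_buffer and _get_screen above (pixel-wise max over the two most recent grayscale frames, then a bilinear resize to 84x84) can be exercised in isolation. A small sketch with synthetic frames standing in for real ALE output:

# Standalone sketch of the frame-merge + resize preprocessing. The random
# frames are synthetic stand-ins for ALE grayscale screens (210x160).
import cv2
import numpy as np

height, width = 84, 84
raw_frames = np.random.randint(0, 256, size=(2, 210, 160), dtype=np.uint8)

merged = raw_frames.max(axis=0)                  # max over the frame buffer
resized = np.zeros((height, width), dtype=np.uint8)
cv2.resize(src=merged, dst=resized, dsize=(width, height),
           interpolation=cv2.INTER_LINEAR)
print(resized.shape)  # (84, 84)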