def eval_proc(file_name):
    print(file_name)
    f = open(os.path.join('./log_more', file_name), 'w+')
    types = ['RANDOM', 'RHCP', 'CDQN', 'MCT']
    # for role_id in [2, 3, 1]:
    #     for ta in types:
    #         agent = make_agent(ta, role_id)
    #         for i in range(1):
    #             env = make_env('MCT')
    #             st = StatCounter()
    #             for j in tqdm(range(100)):
    #                 winning_rate = eval_episode(env, agent)
    #                 st.feed(winning_rate)
    #             f.write('%s with role id %d against %s, winning rate: %f\n'
    #                     % (ta, role_id, 'MCT', st.average))
    for role_id in [2, 3, 1]:
        agent = make_agent('MCT', role_id)
        for i in range(1):
            for te in types:
                env = make_env(te)
                st = StatCounter()
                for j in tqdm(range(100)):
                    winning_rate = eval_episode(env, agent)
                    st.feed(winning_rate)
                f.write('%s with role id %d against %s, winning rate: %f\n'
                        % ('MCT', role_id, te, st.average))
    f.close()
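# --- Hedged usage sketch (not part of the original script) -----------------------------
# eval_proc() only needs an output file name; make_agent / make_env / eval_episode /
# StatCounter are assumed to be provided elsewhere in this repo. A typical invocation,
# optionally parallelised over several output files, might look like:
#
#     eval_proc('mct_as_agent.txt')
#     # or: multiprocessing.Pool(3).map(eval_proc, ['a.txt', 'b.txt', 'c.txt'])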
def run_step(self):
    # sample noise
    z = np.random.normal(size=(GOAL_SAMPLE, 4))
    # generate goals
    gz = self.generator(z)[0]
    if len(self.memory) > 0:
        idx = np.random.randint(len(self.memory), size=(GOAL_SAMPLE // 2, ))
        real_goals = np.stack([self.memory.sample(i) for i in idx], axis=0)
        real_labels = np.zeros([real_goals.shape[0]])
        sampled_goals = np.concatenate([gz, real_goals], axis=0)
    else:
        real_goals = np.zeros([0, 2], np.float32)
        real_labels = np.zeros([0], np.int32)
        sampled_goals = np.concatenate([gz, gz[:GOAL_SAMPLE // 2]], axis=0)
    # label the goals by rolling out LABEL_EVAL random episodes per goal
    if len(self.memory) > 0:
        # note: the original `for i in real_goals.shape[0]` raises TypeError;
        # iterate over range(...) instead
        for i in range(real_goals.shape[0]):
            stat = StatCounter()
            for _ in range(LABEL_EVAL):
                self.env.reset(real_goals[i])
                done = False
                r = 0
                while not done:
                    _, r, done, _ = self.env.step(
                        self.env.action_space.sample())
                stat.feed(r)
            if R_MIN < stat.average < R_MAX:
                real_labels[i] = 1
    gan_df_queue.put([z, real_goals, real_labels])
    goals[:] = sampled_goals.reshape(-1)
    # update policy, batch size 500
    for i in range(5):
        for j in range(1):
            logger.info('policy iter {},{}'.format(i, j))
            self.hooked_sess.run(self.a3c_min)
    # train GAN, batch size 128
    for i in range(2):
        logger.info('gan iter {}'.format(i))
        self.hooked_sess.run(self.d_min)
        self.hooked_sess.run(self.g_min)
    # insert memory: only keep generated goals that are at least EPSILON away
    # from every goal already stored
    if len(self.memory) > 0:
        dist = np.min(
            cdist(gz, self.memory.goal[:len(self.memory)], 'euclidean'), -1)
        for i in range(gz.shape[0]):
            if dist[i] > EPSILON:
                self.memory.append(gz[i])
    else:
        for i in range(gz.shape[0]):
            self.memory.append(gz[i])
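# --- Illustrative helper (assumption, not in the original file) -------------------------
# The labelling loop above implements a Goal-GAN-style criterion: a goal counts as being
# of intermediate difficulty when the average return over LABEL_EVAL random rollouts lies
# strictly between R_MIN and R_MAX. The same rule as a standalone predicate:
#
#     def label_goal(avg_return, r_min, r_max):
#         """1 if the goal is neither trivially easy nor currently unreachable."""
#         return int(r_min < avg_return < r_max)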
class Unity3DPlayer(RLEnvironment):
    ACTION_TABLE = [(0.5, 0.0),    # Forward
                    (-0.5, 0.0),   # Backward
                    (0.5, 1.0),    # Forward-Right
                    (-0.5, 1.0),   # Backward-Right
                    (0.5, -1.0),   # Forward-Left
                    (-0.5, -1.0)]  # Backward-Left

    def __init__(self, connection, skip=1, dumpdir=None, viz=False, auto_restart=True):
        # a dummy player (connection=None) is only used to query the action space
        if connection is not None:
            with _ENV_LOCK:
                self.gymenv = Unity3DEnvironment(server_address=connection)
            self.use_dir = dumpdir
            self.skip = skip
            self.reset_stat()
            self.rwd_counter = StatCounter()
            self.restart_episode()
            self.auto_restart = auto_restart
            self.viz = viz

    def restart_episode(self):
        self.rwd_counter.reset()
        self._ob = self.gymenv.reset()

    def finish_episode(self):
        self.stats['score'].append(self.rwd_counter.sum)

    def current_state(self):
        if self.viz:
            self.gymenv.render()
            time.sleep(self.viz)
        return self._ob

    def action(self, act):
        env_act = self.ACTION_TABLE[act]
        for i in range(self.skip):
            self._ob, r, isOver, info = self.gymenv.step(env_act)
            if r <= -1.0:
                isOver = True
            if isOver:
                break
        self.rwd_counter.feed(r)
        if isOver:
            self.finish_episode()
            if self.auto_restart:
                self.restart_episode()
        return r, isOver

    def get_action_space(self):
        return DiscreteActionSpace(len(self.ACTION_TABLE))

    def close(self):
        self.gymenv.close()
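# --- Hedged usage sketch (assumption, not in the original file) --------------------------
# The (host, port) tuple is a placeholder; Unity3DEnvironment and DiscreteActionSpace are
# assumed to come from this repo / tensorpack.
#
#     player = Unity3DPlayer(connection=('127.0.0.1', 9000), skip=1, viz=0)
#     space = player.get_action_space()
#     for _ in range(1000):
#         r, is_over = player.action(space.sample())
#     player.close()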
def eval_with_funcs(predictors, nr_eval, get_player_fn):
    """
    Args:
        predictors ([PredictorBase])
    """
    class Worker(StoppableThread, ShareSessionThread):
        def __init__(self, func, queue):
            super(Worker, self).__init__()
            self._func = func
            self.q = queue

        def func(self, *args, **kwargs):
            if self.stopped():
                raise RuntimeError("stopped!")
            return self._func(*args, **kwargs)

        def run(self):
            with self.default_sess():
                player = get_player_fn(train=False)
                while not self.stopped():
                    try:
                        score = play_one_episode(player, self.func)
                        # print("Score, ", score)
                    except RuntimeError:
                        return
                    self.queue_put_stoppable(self.q, score)

    q = queue.Queue()
    threads = [Worker(f, q) for f in predictors]

    for k in threads:
        k.start()
        time.sleep(0.1)  # avoid simulator bugs
    stat = StatCounter()
    for _ in tqdm(range(nr_eval), **get_tqdm_kwargs()):
        r = q.get()
        stat.feed(r)
    logger.info("Waiting for all the workers to finish the last run...")
    for k in threads:
        k.stop()
    for k in threads:
        k.join()
    while q.qsize():
        r = q.get()
        stat.feed(r)

    if stat.count > 0:
        return (stat.average, stat.max)
    return (0, 0)
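# --- For reference (assumption, not in the original file) --------------------------------
# A minimal sketch of the play_one_episode() contract assumed by the workers above: roll
# out one greedy episode with the thread's predictor and return the summed reward.
#
#     def play_one_episode(player, predict_func):
#         player.restart_episode()
#         score, is_over = 0.0, False
#         while not is_over:
#             act = np.argmax(predict_func(player.current_state()[None, ...])[0][0])
#             r, is_over = player.action(act)
#             score += r
#         return score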
class AtariPlayer(RLEnvironment):
    """
    A wrapper for atari emulator.
    Will automatically restart when a real episode ends (isOver might be just
    a loss of lives but not game over).
    """

    def __init__(self, rom_file, viz=0, height_range=(None, None),
                 frame_skip=4, image_shape=(84, 84), nullop_start=30,
                 live_lost_as_eoe=True):
        """
        :param rom_file: path to the rom
        :param frame_skip: skip every k frames and repeat the action
        :param image_shape: (w, h)
        :param height_range: (h1, h2) to cut
        :param viz: visualization to be done.
            Set to 0 to disable.
            Set to a positive number to be the delay between frames to show.
            Set to a string to be a directory to store frames.
        :param nullop_start: start with random number of null ops
        :param live_lost_as_eoe: consider loss of lives as end of episode. useful for training.
        """
        super(AtariPlayer, self).__init__()
        if not os.path.isfile(rom_file) and '/' not in rom_file:
            rom_file = get_dataset_path('atari_rom', rom_file)
        assert os.path.isfile(rom_file), \
            "rom {} not found. Please download at {}".format(rom_file, ROM_URL)

        try:
            ALEInterface.setLoggerMode(ALEInterface.Logger.Warning)
        except AttributeError:
            if execute_only_once():
                logger.warn("You're not using latest ALE")

        # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86
        with _ALE_LOCK:
            self.ale = ALEInterface()
            self.rng = get_rng(self)
            self.ale.setInt(b"random_seed", self.rng.randint(0, 30000))
            self.ale.setBool(b"showinfo", False)

            self.ale.setInt(b"frame_skip", 1)
            self.ale.setBool(b'color_averaging', False)
            # manual.pdf suggests otherwise.
            self.ale.setFloat(b'repeat_action_probability', 0.0)

            # viz setup
            if isinstance(viz, six.string_types):
                assert os.path.isdir(viz), viz
                self.ale.setString(b'record_screen_dir', viz)
                viz = 0
            if isinstance(viz, int):
                viz = float(viz)
            self.viz = viz
            if self.viz and isinstance(self.viz, float):
                self.windowname = os.path.basename(rom_file)
                cv2.startWindowThread()
                cv2.namedWindow(self.windowname)

            self.ale.loadROM(rom_file.encode('utf-8'))
        self.width, self.height = self.ale.getScreenDims()
        self.actions = self.ale.getMinimalActionSet()

        self.live_lost_as_eoe = live_lost_as_eoe
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start
        self.height_range = height_range
        self.image_shape = image_shape

        self.current_episode_score = StatCounter()
        self.restart_episode()

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        m = self.ale.getScreenRGB()
        return m.reshape((self.height, self.width, 3))

    def current_state(self):
        """
        :returns: a gray-scale (h, w) uint8 image
        """
        ret = self._grab_raw_image()
        # max-pooled over the last screen
        ret = np.maximum(ret, self.last_raw_screen)
        if self.viz:
            if isinstance(self.viz, float):
                cv2.imshow(self.windowname, ret)
                time.sleep(self.viz)
        ret = ret[self.height_range[0]:self.height_range[1], :].astype('float32')
        # 0.299, 0.587, 0.114. same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)
        ret = cv2.resize(ret, self.image_shape)
        return ret.astype('uint8')  # to save some memory

    def get_action_space(self):
        return DiscreteActionSpace(len(self.actions))

    def finish_episode(self):
        self.stats['score'].append(self.current_episode_score.sum)

    def restart_episode(self):
        self.current_episode_score.reset()
        with _ALE_LOCK:
            self.ale.reset_game()

        # random null-ops start
        n = self.rng.randint(self.nullop_start)
        self.last_raw_screen = self._grab_raw_image()
        for k in range(n):
            if k == n - 1:
                self.last_raw_screen = self._grab_raw_image()
            self.ale.act(0)

    def action(self, act):
        """
        :param act: an index of the action
        :returns: (reward, isOver)
        """
        oldlives = self.ale.lives()
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            r += self.ale.act(self.actions[act])
            newlives = self.ale.lives()
            if self.ale.game_over() or \
                    (self.live_lost_as_eoe and newlives < oldlives):
                break

        self.current_episode_score.feed(r)
        isOver = self.ale.game_over()
        if self.live_lost_as_eoe:
            isOver = isOver or newlives < oldlives
        if isOver:
            self.finish_episode()
            if self.ale.game_over():
                self.restart_episode()
        return (r, isOver)
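# --- Hedged usage sketch (assumption, not in the original file) --------------------------
# A random-policy rollout against the wrapper above; the rom name is a placeholder.
#
#     player = AtariPlayer('breakout.bin', frame_skip=4, image_shape=(84, 84))
#     space = player.get_action_space()
#     for _ in range(1000):
#         state = player.current_state()       # (84, 84) uint8, max-pooled over 2 frames
#         r, is_over = player.action(space.sample())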
class ExpReplay(DataFlow, Callback): """ Implement experience replay in the paper `Human-level control through deep reinforcement learning <http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html>`_. This implementation provides the interface as a :class:`DataFlow`. This DataFlow is __not__ fork-safe (thus doesn't support multiprocess prefetching). This implementation assumes that state is batch-able, and the network takes batched inputs. """ def __init__(self, # model, agent_name, state_shape, num_actions, batch_size, memory_size, init_memory_size, init_exploration, update_frequency, pipe_exp2sim, pipe_sim2exp): logger.info('starting expreplay {}'.format(agent_name)) self.init_memory_size = int(init_memory_size) self.context = zmq.Context() # no reply for now # self.exp2sim_socket = self.context.socket(zmq.ROUTER) # self.exp2sim_socket.set_hwm(20) # self.exp2sim_socket.bind(pipe_exp2sim) self.sim2exp_socket = self.context.socket(zmq.PULL) self.sim2exp_socket.set_hwm(2) self.sim2exp_socket.bind(pipe_sim2exp) self.queue = queue.Queue(maxsize=1000) # self.model = model for k, v in locals().items(): if k != 'self': setattr(self, k, v) self.agent_name = agent_name self.exploration = init_exploration self.num_actions = num_actions logger.info("Number of Legal actions: {}, {}".format(*self.num_actions)) self.rng = get_rng(self) self._init_memory_flag = threading.Event() # tell if memory has been initialized # a queue to receive notifications to populate memory self._populate_job_queue = queue.Queue(maxsize=5) self.mem = ReplayMemory(memory_size, state_shape) # self._current_ob, self._action_space = self.get_state_and_action_spaces() self._player_scores = StatCounter() self._current_game_score = StatCounter() def get_recv_thread(self): def f(): msg = self.sim2exp_socket.recv(copy=False).bytes msg = loads(msg) print('{}: received msg'.format(self.agent_name)) try: self.queue.put_nowait(msg) except Exception: logger.info('put queue failed!') # send response or not? 
recv_thread = LoopThread(f, pausable=False) # recv_thread.daemon = True recv_thread.name = "recv thread" return recv_thread def get_simulator_thread(self): # spawn a separate thread to run policy def populate_job_func(): self._populate_job_queue.get() i = 0 # synchronous training while i < self.update_frequency: if self._populate_exp(): i += 1 time.sleep(0.1) # for _ in range(self.update_frequency): # self._populate_exp() th = ShareSessionThread(LoopThread(populate_job_func, pausable=False)) th.name = "SimulatorThread" return th def _init_memory(self): logger.info("{} populating replay memory with epsilon={} ...".format(self.agent_name, self.exploration)) with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < self.init_memory_size: if self._populate_exp(): pbar.update() self._init_memory_flag.set() def _populate_exp(self): """ populate a transition by epsilon-greedy""" try: # do not wait for an update, this may cause some agents have old replay buffer trained more times before new buffer comes in state, action, reward, isOver, comb_mask, fine_mask = self.queue.get_nowait() self._current_game_score.feed(reward) # print(reward) if isOver: self._player_scores.feed(self._current_game_score.sum) self._current_game_score.reset() self.mem.append(Experience(np.stack(state), action, reward, isOver, comb_mask, np.stack(fine_mask))) return True except queue.Empty: return False def get_data(self): # wait for memory to be initialized self._init_memory_flag.wait() while True: idx = self.rng.randint( self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - 1, size=self.batch_size) batch_exp = [self.mem.sample(i) for i in idx] yield self._process_batch(batch_exp) self._populate_job_queue.put(1) def _process_batch(self, batch_exp): state = np.asarray([e[0] for e in batch_exp], dtype='float32') action = np.asarray([e[1] for e in batch_exp], dtype='int32') reward = np.asarray([e[2] for e in batch_exp], dtype='float32') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') comb_mask = np.asarray([e[4] for e in batch_exp], dtype='bool') fine_mask = np.asarray([e[5] for e in batch_exp], dtype='bool') return [state, action, reward, isOver, comb_mask, fine_mask] def _setup_graph(self): self._recv_th = self.get_recv_thread() self._recv_th.start() # self.curr_predictor = self.trainer.get_predictor([self.agent_name + '/state:0', self.agent_name + '_comb_mask:0', self.agent_name + '/fine_mask:0'], [self.agent_name + '/Qvalue:0']) def _before_train(self): logger.info('{}-receive thread started'.format(self.agent_name)) self._simulator_th = self.get_simulator_thread() self._simulator_th.start() self._init_memory() def _trigger(self): from simulator.tools import mean_score_logger v = self._player_scores try: mean, max = v.average, v.max logger.info('{} mean_score: {}'.format(self.agent_name, mean)) mean_score_logger('{} mean_score: {}\n'.format(self.agent_name, mean)) self.trainer.monitors.put_scalar('expreplay/mean_score', mean) self.trainer.monitors.put_scalar('expreplay/max_score', max) except Exception: logger.exception(self.agent_name + " Cannot log training scores.") v.reset()
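# --- Hedged sketch of the producer side implied by the PULL socket above -----------------
# (assumption, not in the original file). A simulator process would PUSH serialized
# (state, action, reward, isOver, comb_mask, fine_mask) tuples to pipe_sim2exp; dumps()
# is assumed to be the counterpart of the loads() used in get_recv_thread().
#
#     ctx = zmq.Context()
#     sock = ctx.socket(zmq.PUSH)
#     sock.set_hwm(2)
#     sock.connect(pipe_sim2exp)
#     sock.send(dumps((state, action, reward, is_over, comb_mask, fine_mask)), copy=False)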
class AtariPlayer(gym.Env):
    """
    A wrapper for ALE emulator, with configurations to mimic DeepMind DQN settings.

    Info:
        score: the accumulated reward in the current game
        gameOver: True when the current game is Over
    """

    def __init__(self, rom_file, viz=0,
                 frame_skip=4, nullop_start=30,
                 live_lost_as_eoe=True, max_num_frames=0):
        """
        Args:
            rom_file: path to the rom
            frame_skip: skip every k frames and repeat the action
            viz: visualization to be done.
                Set to 0 to disable.
                Set to a positive number to be the delay between frames to show.
                Set to a string to be a directory to store frames.
            nullop_start: start with random number of null ops.
            live_lost_as_eoe: consider loss of lives as end of episode. Useful for training.
            max_num_frames: maximum number of frames per episode.
        """
        super(AtariPlayer, self).__init__()
        if not os.path.isfile(rom_file) and '/' not in rom_file:
            rom_file = get_dataset_path('atari_rom', rom_file)
        assert os.path.isfile(rom_file), \
            "rom {} not found. Please download at {}".format(rom_file, ROM_URL)

        try:
            ALEInterface.setLoggerMode(ALEInterface.Logger.Error)
        except AttributeError:
            if execute_only_once():
                logger.warn("You're not using latest ALE")

        # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86
        with _ALE_LOCK:
            self.ale = ALEInterface()
            self.rng = get_rng(self)
            self.ale.setInt(b"random_seed", self.rng.randint(0, 30000))
            self.ale.setInt(b"max_num_frames_per_episode", max_num_frames)
            self.ale.setBool(b"showinfo", False)

            self.ale.setInt(b"frame_skip", 1)
            self.ale.setBool(b'color_averaging', False)
            # manual.pdf suggests otherwise.
            self.ale.setFloat(b'repeat_action_probability', 0.0)

            # viz setup
            if isinstance(viz, six.string_types):
                assert os.path.isdir(viz), viz
                self.ale.setString(b'record_screen_dir', viz)
                viz = 0
            if isinstance(viz, int):
                viz = float(viz)
            self.viz = viz
            if self.viz and isinstance(self.viz, float):
                self.windowname = os.path.basename(rom_file)
                cv2.startWindowThread()
                cv2.namedWindow(self.windowname)

            self.ale.loadROM(rom_file.encode('utf-8'))
        self.width, self.height = self.ale.getScreenDims()
        self.actions = self.ale.getMinimalActionSet()

        self.live_lost_as_eoe = live_lost_as_eoe
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start

        self.current_episode_score = StatCounter()

        self.action_space = spaces.Discrete(len(self.actions))
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.height, self.width))
        self._restart_episode()

    def get_action_meanings(self):
        return [ACTION_MEANING[i] for i in self.actions]

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        m = self.ale.getScreenRGB()
        return m.reshape((self.height, self.width, 3))

    def _current_state(self):
        """
        :returns: a gray-scale (h, w) uint8 image
        """
        ret = self._grab_raw_image()
        # max-pooled over the last screen
        ret = np.maximum(ret, self.last_raw_screen)
        if self.viz:
            if isinstance(self.viz, float):
                cv2.imshow(self.windowname, ret)
                time.sleep(self.viz)
        ret = ret.astype('float32')
        # 0.299, 0.587, 0.114. same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)
        return ret.astype('uint8')  # to save some memory

    def _restart_episode(self):
        self.current_episode_score.reset()
        with _ALE_LOCK:
            self.ale.reset_game()

        # random null-ops start
        n = self.rng.randint(self.nullop_start)
        self.last_raw_screen = self._grab_raw_image()
        for k in range(n):
            if k == n - 1:
                self.last_raw_screen = self._grab_raw_image()
            self.ale.act(0)

    def _reset(self):
        if self.ale.game_over():
            self._restart_episode()
        return self._current_state()

    def _step(self, act):
        oldlives = self.ale.lives()
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            r += self.ale.act(self.actions[act])
            newlives = self.ale.lives()
            if self.ale.game_over() or \
                    (self.live_lost_as_eoe and newlives < oldlives):
                break

        self.current_episode_score.feed(r)
        trueIsOver = isOver = self.ale.game_over()
        if self.live_lost_as_eoe:
            isOver = isOver or newlives < oldlives

        info = {
            'score': self.current_episode_score.sum,
            'gameOver': trueIsOver
        }
        return self._current_state(), r, isOver, info
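# --- Hedged usage sketch (assumption, not in the original file) --------------------------
# Standard gym-style rollout (the _reset/_step methods above are exposed as reset/step by
# the old gym.Env wrapper); the rom name is a placeholder.
#
#     env = AtariPlayer('breakout.bin', frame_skip=4, max_num_frames=60000)
#     ob, done = env.reset(), False
#     while not done:
#         ob, r, done, info = env.step(env.action_space.sample())
#     print(info['score'])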
def eval_with_funcs(predictors, nr_eval, get_player_fn, directory=None,
                    files_list=None):
    """
    Args:
        predictors ([PredictorBase])

    Runs episodes in parallel, returning statistics about the model performance.
    """

    class Worker(StoppableThread, ShareSessionThread):
        def __init__(self, func, queue, distErrorQueue):
            super(Worker, self).__init__()
            self._func = func
            self.q = queue
            self.q_dist = distErrorQueue

        def func(self, *args, **kwargs):
            if self.stopped():
                raise RuntimeError("stopped!")
            return self._func(*args, **kwargs)

        def run(self):
            with self.default_sess():
                player = get_player_fn(directory=directory,
                                       task=False,
                                       files_list=files_list)
                while not self.stopped():
                    try:
                        score, filename, distance_error, q_values = play_one_episode(
                            player, self.func)
                        # print("Score, ", score)
                    except RuntimeError:
                        return
                    self.queue_put_stoppable(self.q, score)
                    self.queue_put_stoppable(self.q_dist, distance_error)

    q = queue.Queue()
    q_dist = queue.Queue()
    threads = [Worker(f, q, q_dist) for f in predictors]

    # start all workers
    for k in threads:
        k.start()
        time.sleep(0.1)  # avoid simulator bugs
    stat = StatCounter()
    dist_stat = StatCounter()

    # show progress bar w/ tqdm
    for _ in tqdm(range(nr_eval), **get_tqdm_kwargs()):
        r = q.get()
        stat.feed(r)
        dist = q_dist.get()
        dist_stat.feed(dist)

    logger.info("Waiting for all the workers to finish the last run...")
    for k in threads:
        k.stop()
    for k in threads:
        k.join()
    while q.qsize():
        r = q.get()
        stat.feed(r)
    while q_dist.qsize():
        dist = q_dist.get()
        dist_stat.feed(dist)

    if stat.count > 0:
        return (stat.average, stat.max, dist_stat.average, dist_stat.max)
    return (0, 0, 0, 0)
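# --- Hedged usage sketch (assumption, not in the original file) --------------------------
# One predictor per evaluation thread; pred_config / get_player_fn stand in for this
# repo's own setup.
#
#     predictors = [OfflinePredictor(pred_config) for _ in range(nr_threads)]
#     mean_score, max_score, mean_dist, max_dist = eval_with_funcs(
#         predictors, nr_eval=100, get_player_fn=get_player_fn,
#         directory='eval_results.csv', files_list=test_files)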
class MedicalPlayer(gym.Env): """Class that provides 3D medical image environment. This is just an implementation of the classic "agent-environment loop". Each time-step, the agent chooses an action, and the environment returns an observation and a reward.""" def __init__(self, directory=None, viz=False, task=False, files_list=None, screen_dims=(27, 27, 27), history_length=20, multiscale=True, max_num_frames=0, saveGif=False, saveVideo=False, data_type=None): """ :param train_directory: environment or game name :param viz: visualization set to 0 to disable set to +ve number to be the delay between frames to show set to a string to be the directory for storing frames :param screen_dims: shape of the frame cropped from the image to feed it to dqn (d,w,h) - defaults (27,27,27) :param nullop_start: start with random number of null ops :param location_history_length: consider lost of lives as end of episode (useful for training) :max_num_frames: maximum numbe0r of frames per episode. """ # ###################################################################### # ## generate evaluation results from 19 different points # ## save results in csv file # self.csvfile = 'DuelDoubleDQN_multiscale_brain_mri_point_pc_ROI_45_45_45_midl2018.csv' # if not train: # with open(self.csvfile, 'w') as outcsv: # fields = ["filename", "dist_error"] # writer = csv.writer(outcsv) # writer.writerow(map(lambda x: x, fields)) # # x = [0.5,0.25,0.75] # y = [0.5,0.25,0.75] # z = [0.5,0.25,0.75] # self.start_points = [] # for combination in itertools.product(x, y, z): # if 0.5 in combination: self.start_points.append(combination) # self.start_points = itertools.cycle(self.start_points) # self.count_points = 0 # self.total_loc = [] # ###################################################################### super(MedicalPlayer, self).__init__() # inits stat counters self.reset_stat() # counter to limit number of steps per episodes self.cnt = 0 # maximum number of frames (steps) per episodes self.max_num_frames = max_num_frames # stores information: terminal, score, distError self.info = None # option to save display as gif self.saveGif = saveGif self.saveVideo = saveVideo # training flag self.task = task # image dimension (2D/3D) self.screen_dims = screen_dims self.dims = len(self.screen_dims) # multi-scale agent self.multiscale = multiscale #Type of data self.data_type = data_type #directory is file for logging evaluation self.directory = directory # init env dimensions if self.dims == 2: self.width, self.height = screen_dims else: self.width, self.height, self.depth = screen_dims with _ALE_LOCK: self.rng = get_rng(self) # visualization setup if isinstance(viz, six.string_types): # check if viz is a string assert os.path.isdir(viz), viz viz = 0 if isinstance(viz, int): viz = float(viz) self.viz = viz if self.viz and isinstance(self.viz, float): self.viewer = None self.gif_buffer = [] # stat counter to store current score or accumlated reward self.current_episode_score = StatCounter() # get action space and minimal action set self.action_space = spaces.Discrete(6) # change number actions here self.actions = self.action_space.n self.observation_space = spaces.Box(low=0, high=255, shape=self.screen_dims, dtype=np.uint8) # history buffer for storing last locations to check oscilations self._history_length = history_length # initialize rectangle limits from input image coordinates self.rectangle = Rectangle(0, 0, 0, 0, 0, 0) # add your data loader here self.set_dataLoader(files_list) # prepare file sampler self.filepath = None 
self.HITL_logger = [] self._loc_history = None # reset buffer, terminal, counters, and init new_random_game self._restart_episode() def set_dataLoader(self, files_list): if self.data_type == 'BrainMRI': self.data_loader = filesListBrainMRLandmark elif self.data_type == 'CardiacMRI': self.data_loader = filesListCardioLandmark elif self.data_type == 'FetalUS': self.data_loader = filesListFetalUSLandmark elif self.data_type == "HITL": self.data_loader = fileHITL if self.task == 'play': self.files = self.data_loader(files_list, returnLandmarks=False) else: self.files = self.data_loader(files_list, returnLandmarks=True) self.sampled_files = self.files.sample_circular() def HITL_episode_log(self): """ Method to save episode info for HITL """ log = { 'states': self._loc_history, 'rewards': self._reward_history, 'actions': self._act_history, 'target': self._target_loc, 'img_name': self.filename, 'is_over': [False for i in range(len(self._loc_history) - 1)] + [True], 'resolution': self._res_history, } self.HITL_logger.append(log) def HITL_set_location(self, location, res): """ Method to set the location in the image to that specified in the logs """ self._location = location self.xscale = res self.yscale = res self.zscale = res def reset(self): # with _ALE_LOCK: self._restart_episode() return self._current_state() def _restart_episode(self): """ restart current episoide """ if self.task == 'browse' and self._loc_history: self.HITL_episode_log() self.terminal = False self.reward = 0 self.cnt = 0 # counter to limit number of steps per episodes self.num_games.feed(1) self.current_episode_score.reset() # reset the stat counter self._loc_history = [(0, ) * self.dims] * self._history_length # list of q-value lists self._qvalues_history = [(0, ) * self.actions] * self._history_length self._clear_history() self.new_random_game() def new_random_game(self): """ load image, set dimensions, randomize start point, init _screen, qvals, calc distance to goal """ self.terminal = False self.viewer = None # ###################################################################### # ## generate evaluation results from 19 different points # if self.count_points ==0: # print('\n============== new game ===============\n') # # save results # if self.total_loc: # with open(self.csvfile, 'a') as outcsv: # fields= [self.filename, self.cur_dist] # writer = csv.writer(outcsv) # writer.writerow(map(lambda x: x, fields)) # self.total_loc = [] # # sample a new image # self._image, self._target_loc, self.filepath, self.spacing = next(self.sampled_files) # scale = next(self.start_points) # self.count_points +=1 # else: # self.count_points += 1 # logger.info('count_points {}'.format(self.count_points)) # scale = next(self.start_points) # # x = int(scale[0] * self._image.dims[0]) # y = int(scale[1] * self._image.dims[1]) # z = int(scale[2] * self._image.dims[2]) # logger.info('starting point {}-{}-{}'.format(x,y,z)) # ###################################################################### # sample a new image self._image, self._target_loc, self.filepath, self.spacing = next( self.sampled_files) self.filename = os.path.basename(self.filepath) # multiscale (e.g. 
start with 3 -> 2 -> 1) # scale can be thought of as sampling stride if self.multiscale: # #cardiac # if self.data_type == 'CardiacMRI': # self.action_step = 6 # self.xscale = 2 # self.yscale = 2 # self.zscale = 2 # #brain or fetal # else: # self.action_step = 9 # self.xscale = 3 # self.yscale = 3 # self.zscale = 3 self.action_step = 9 self.xscale = 3 self.yscale = 3 self.zscale = 3 else: self.action_step = 1 self.xscale = 1 self.yscale = 1 self.zscale = 1 # image volume size self._image_dims = self._image.dims ####################################################################### ## select random starting point # add padding to avoid start right on the border of the image if (self.task == 'train'): skip_thickness = ((int)(self._image_dims[0] / 5), (int)(self._image_dims[1] / 5), (int)(self._image_dims[2] / 5)) else: skip_thickness = (int(self._image_dims[0] / 4), int(self._image_dims[1] / 4), int(self._image_dims[2] / 4)) x = self.rng.randint(0 + skip_thickness[0], self._image_dims[0] - skip_thickness[0]) y = self.rng.randint(0 + skip_thickness[1], self._image_dims[1] - skip_thickness[1]) z = self.rng.randint(0 + skip_thickness[2], self._image_dims[2] - skip_thickness[2]) ####################################################################### self._location = (x, y, z) self._start_location = (x, y, z) self._qvalues = [ 0, ] * self.actions self._screen = self._current_state() if self.task == 'play': self.cur_dist = 0 else: self.cur_dist = self.calcDistance(self._location, self._target_loc, self.spacing) def calcDistance(self, points1, points2, spacing=(1, 1, 1)): """ calculate the distance between two points in mm""" spacing = np.array(spacing) points1 = spacing * np.array(points1) points2 = spacing * np.array(points2) return np.linalg.norm(points1 - points2) def step(self, act, qvalues, viewer=None): """The environment's step function returns exactly what we need. Args: act: Returns: observation (object): an environment-specific object representing your observation of the environment. For example, pixel data from a camera, joint angles and joint velocities of a robot, or the board state in a board game. reward (float): amount of reward achieved by the previous action. The scale varies between environments, but the goal is always to increase your total reward. done (boolean): whether it's time to reset the environment again. Most (but not all) tasks are divided up into well-defined episodes, and done being True indicates the episode has terminated. (For example, perhaps the pole tipped too far, or you lost your last life.) info (dict): diagnostic information useful for debugging. It can sometimes be useful for learning (for example, it might contain the raw probabilities behind the environment's last state change). However, official evaluations of your agent are not allowed to use this for learning. 
""" self._qvalues = qvalues current_loc = self._location self.terminal = False go_out = False self.viewer = viewer # UP Z+ ----------------------------------------------------------- if (act == 0): next_location = (current_loc[0], current_loc[1], round(current_loc[2] + self.action_step)) if (next_location[2] >= self._image_dims[2]): # print(' trying to go out the image Z+ ',) next_location = current_loc go_out = True # FORWARD Y+ --------------------------------------------------------- if (act == 1): next_location = (current_loc[0], round(current_loc[1] + self.action_step), current_loc[2]) if (next_location[1] >= self._image_dims[1]): # print(' trying to go out the image Y+ ',) next_location = current_loc go_out = True # RIGHT X+ ----------------------------------------------------------- if (act == 2): next_location = (round(current_loc[0] + self.action_step), current_loc[1], current_loc[2]) if next_location[0] >= self._image_dims[0]: # print(' trying to go out the image X+ ',) next_location = current_loc go_out = True # LEFT X- ----------------------------------------------------------- if act == 3: next_location = (round(current_loc[0] - self.action_step), current_loc[1], current_loc[2]) if next_location[0] <= 0: # print(' trying to go out the image X- ',) next_location = current_loc go_out = True # BACKWARD Y- --------------------------------------------------------- if act == 4: next_location = (current_loc[0], round(current_loc[1] - self.action_step), current_loc[2]) if next_location[1] <= 0: # print(' trying to go out the image Y- ',) next_location = current_loc go_out = True # DOWN Z- ----------------------------------------------------------- if act == 5: next_location = (current_loc[0], current_loc[1], round(current_loc[2] - self.action_step)) if next_location[2] <= 0: # print(' trying to go out the image Z- ',) next_location = current_loc go_out = True # --------------------------------------------------------------------- # --------------------------------------------------------------------- # punish -1 reward if the agent tries to go out if (self.task != 'play'): if go_out: self.reward = -1 else: self.reward = self._calc_reward(current_loc, next_location) # update screen, reward ,location, terminal self._location = next_location self._screen = self._current_state() # terminate if the distance is less than 1 during trainig if (self.task == 'train'): if self.cur_dist <= 1: # print('Terminal Condition DISTANCE') self.terminal = True self.num_success.feed(1) # terminate if maximum number of steps is reached self.cnt += 1 if self.cnt >= self.max_num_frames: # print('Terminal Condition NUMBER OF FRAMES') self.terminal = True # update history buffer with new location and qvalues if (self.task != 'play'): self.cur_dist = self.calcDistance(self._location, self._target_loc, self.spacing) self._update_history() # check if agent oscillates if self._oscillate: self._location = self.getBestLocation() self._screen = self._current_state() if self.task != 'play': self.cur_dist = self.calcDistance(self._location, self._target_loc, self.spacing) # multi-scale steps if self.multiscale: if self.xscale > 1: self.adjustMultiScale() # terminate if scale is less than 1 else: self.terminal = True # print("TERMINAL OCCILATE") if self.cur_dist <= 1: self.num_success.feed(1) else: self.terminal = True # print("TERMINAL OCCILATE") if self.cur_dist <= 1: self.num_success.feed(1) # render screen if viz is on with _ALE_LOCK: if self.viz: if isinstance(self.viz, float): self.display() distance_error = 
self.cur_dist self.current_episode_score.feed(self.reward) # print(self.reward) this is every step of the agent info = { 'score': self.current_episode_score.sum, 'gameOver': self.terminal, 'distError': distance_error, 'filename': self.filename } if self.terminal: # store results when batch evaluation if self.directory: path = self.directory with open(path, 'a') as outcsv: fields = [ info['filename'], info['score'], info['distError'] ] writer = csv.writer(outcsv) writer.writerow(map(lambda x: x, fields)) # ####################################################################### # ## generate evaluation results from 19 different points # if self.terminal: # logger.info(info) # self.total_loc.append(self._location) # if not(self.count_points == 19): # self._restart_episode() # else: # mean_location = np.mean(self.total_loc,axis=0) # logger.info('total_loc {} \n mean_location{}'.format(self.total_loc, mean_location)) # self.cur_dist = self.calcDistance(mean_location, # self._target_loc, # self.spacing) # logger.info('final distance error {} \n'.format(self.cur_dist)) # self.count_points = 0 # ####################################################################### return self._current_state(), self.reward, self.terminal, info def stepManual(self, act, viewer): """ Version of above for browse mode allowing the user to navigate through an uploaded img """ # self._qvalues = qvalues current_loc = self._location self.terminal = False go_out = False self.viewer = viewer self._act = act # -1 passed during init so skip updating current location if act == -1: pass else: # UP Z+ ----------------------------------------------------------- if (act == 0): next_location = (current_loc[0], current_loc[1], round(current_loc[2] + self.action_step)) if (next_location[2] >= self._image_dims[2]): # print(' trying to go out the image Z+ ',) next_location = current_loc go_out = True # FORWARD Y+ --------------------------------------------------------- if (act == 1): next_location = (current_loc[0], round(current_loc[1] + self.action_step), current_loc[2]) if (next_location[1] >= self._image_dims[1]): # print(' trying to go out the image Y+ ',) next_location = current_loc go_out = True # RIGHT X+ ----------------------------------------------------------- if (act == 2): next_location = (round(current_loc[0] + self.action_step), current_loc[1], current_loc[2]) if next_location[0] >= self._image_dims[0]: # print(' trying to go out the image X+ ',) next_location = current_loc go_out = True # LEFT X- ----------------------------------------------------------- if act == 3: next_location = (round(current_loc[0] - self.action_step), current_loc[1], current_loc[2]) if next_location[0] <= 0: # print(' trying to go out the image X- ',) next_location = current_loc go_out = True # BACKWARD Y- --------------------------------------------------------- if act == 4: next_location = (current_loc[0], round(current_loc[1] - self.action_step), current_loc[2]) if next_location[1] <= 0: # print(' trying to go out the image Y- ',) next_location = current_loc go_out = True # DOWN Z- ----------------------------------------------------------- if act == 5: next_location = (current_loc[0], current_loc[1], round(current_loc[2] - self.action_step)) if next_location[2] <= 0: # print(' trying to go out the image Z- ',) next_location = current_loc go_out = True if go_out: self.reward = -1 else: self.reward = self._calc_reward(current_loc, next_location) self._location = next_location self._screen = self._current_state() self.cur_dist = 
self.calcDistance(self._location, self._target_loc, self.spacing) self._update_history() # render screen if viz is on with _ALE_LOCK: if self.viz: if isinstance(self.viz, float): self.display() return self._current_state() def getBestLocation(self): ''' get best location with best qvalue from last for locations stored in history ''' last_qvalues_history = self._qvalues_history[-4:] last_loc_history = self._loc_history[-4:] best_qvalues = np.max(last_qvalues_history, axis=1) # best_idx = best_qvalues.argmax() best_idx = best_qvalues.argmin() best_location = last_loc_history[best_idx] return best_location def adjustMultiScale(self, higherRes=True): '''Adjusts the agent's step size''' if higherRes: self.xscale -= 1 self.yscale -= 1 self.zscale -= 1 self.action_step = int(self.action_step / 3) else: self.xscale += 1 self.yscale += 1 self.zscale += 1 self.action_step = int(self.action_step * 3) self._clear_history() def _clear_history(self): ''' clear history buffer with current state ''' if self.task == 'browse': self._loc_history = [] self._act_history = [] self._reward_history = [] self._res_history = [] else: self._loc_history = [(0, ) * self.dims] * self._history_length self._qvalues_history = [(0, ) * self.actions ] * self._history_length def _update_history(self): ''' update history buffer with current state ''' if self.task == 'browse': self._loc_history.append(self._location) self._act_history.append(self._act) self._res_history.append(self.xscale) self._reward_history.append(self.reward) else: # update location history self._loc_history[:-1] = self._loc_history[1:] self._loc_history[-1] = self._location # update q-value history self._qvalues_history[:-1] = self._qvalues_history[1:] self._qvalues_history[-1] = self._qvalues def _current_state(self): """ crop image data around current location to update what network sees. 
update rectangle :return: new state """ # initialize screen with zeros - all background screen = np.zeros((self.screen_dims)).astype(self._image.data.dtype) # screen uses coordinate system relative to origin (0, 0, 0) screen_xmin, screen_ymin, screen_zmin = 0, 0, 0 screen_xmax, screen_ymax, screen_zmax = self.screen_dims # extract boundary locations using coordinate system relative to "global" image # width, height, depth in terms of screen coord system if self.xscale % 2: xmin = self._location[0] - int(self.width * self.xscale / 2) - 1 xmax = self._location[0] + int(self.width * self.xscale / 2) ymin = self._location[1] - int(self.height * self.yscale / 2) - 1 ymax = self._location[1] + int(self.height * self.yscale / 2) zmin = self._location[2] - int(self.depth * self.zscale / 2) - 1 zmax = self._location[2] + int(self.depth * self.zscale / 2) else: xmin = self._location[0] - round(self.width * self.xscale / 2) xmax = self._location[0] + round(self.width * self.xscale / 2) ymin = self._location[1] - round(self.height * self.yscale / 2) ymax = self._location[1] + round(self.height * self.yscale / 2) zmin = self._location[2] - round(self.depth * self.zscale / 2) zmax = self._location[2] + round(self.depth * self.zscale / 2) # check if they violate image boundary and fix it if xmin < 0: xmin = 0 screen_xmin = screen_xmax - len(np.arange(xmin, xmax, self.xscale)) if ymin < 0: ymin = 0 screen_ymin = screen_ymax - len(np.arange(ymin, ymax, self.yscale)) if zmin < 0: zmin = 0 screen_zmin = screen_zmax - len(np.arange(zmin, zmax, self.zscale)) if xmax > self._image_dims[0]: xmax = self._image_dims[0] screen_xmax = screen_xmin + len(np.arange(xmin, xmax, self.xscale)) if ymax > self._image_dims[1]: ymax = self._image_dims[1] screen_ymax = screen_ymin + len(np.arange(ymin, ymax, self.yscale)) if zmax > self._image_dims[2]: zmax = self._image_dims[2] screen_zmax = screen_zmin + len(np.arange(zmin, zmax, self.zscale)) # crop image data to update what network sees # image coordinate system becomes screen coordinates # scale can be thought of as a stride screen[screen_xmin:screen_xmax, screen_ymin:screen_ymax, screen_zmin:screen_zmax] = self._image.data[ xmin:xmax:self.xscale, ymin:ymax:self.yscale, zmin:zmax:self.zscale] # update rectangle limits from input image coordinates # this is what the network sees self.rectangle = Rectangle(xmin, xmax, ymin, ymax, zmin, zmax) return screen def get_plane_z(self, z=0): im = self._image.data[:, :, z] if self.data_type in ['BrainMRI', 'CardiacMRI']: im = np.rot90(im, 1) # Rotate 90 degrees ccw return im def get_plane_x(self, x=0): im = self._image.data[x, :, :] im = np.rot90(im, 1) return im def get_plane_y(self, y=0): im = self._image.data[:, y, :] im = np.rot90(im, 1) return im def _calc_reward(self, current_loc, next_loc): """ Calculate the new reward based on the decrease in euclidean distance to the target location """ curr_dist = self.calcDistance(current_loc, self._target_loc, self.spacing) next_dist = self.calcDistance(next_loc, self._target_loc, self.spacing) return curr_dist - next_dist @property def _oscillate(self): """ Return True if the agent is stuck and oscillating """ counter = Counter(self._loc_history) freq = counter.most_common() if freq[0][0] == (0, 0, 0): if (freq[1][1] > 3): return True else: return False elif (freq[0][1] > 3): return True def get_action_meanings(self): """ return array of integers for actions""" ACTION_MEANING = { 1: "UP", # MOVE Z+ 2: "FORWARD", # MOVE Y+ 3: "RIGHT", # MOVE X+ 4: "LEFT", # MOVE X- 5: "BACKWARD", # MOVE 
Y- 6: "DOWN", # MOVE Z- } return [ACTION_MEANING[i] for i in self.actions] @property def getScreenDims(self): """ return screen dimensions """ return (self.width, self.height, self.depth) def lives(self): return None def reset_stat(self): """ Reset all statistics counter""" self.stats = defaultdict(list) self.num_games = StatCounter() self.num_success = StatCounter() def display(self, return_rgb_array=False): # get dimensions current_point = self._location target_point = self._target_loc # get image and convert it to pyglet plane = self.get_plane_z(current_point[2]) plane_x = self.get_plane_x(current_point[0]) plane_y = self.get_plane_y(current_point[1]) # rescale image # INTER_NEAREST, INTER_LINEAR, INTER_AREA, INTER_CUBIC, INTER_LANCZOS4 scale_x = 2 scale_y = 2 scale_z = 2 current_point = (current_point[0] * scale_x, current_point[1] * scale_y, current_point[2] * scale_z) if target_point is not None: target_point = (target_point[0] * scale_x, target_point[1] * scale_y, target_point[2] * scale_z) self.rectangle = (self.rectangle[0] * scale_x, self.rectangle[1] * scale_x, self.rectangle[2] * scale_y, self.rectangle[3] * scale_y, self.rectangle[4] * scale_z, self.rectangle[5] * scale_z) img = cv2.resize( plane, (int(scale_x * plane.shape[1]), int(scale_y * plane.shape[0])), interpolation=cv2.INTER_LINEAR) img_x = cv2.resize( plane_x, (int(scale_x * plane_x.shape[1]), int(scale_y * plane_x.shape[0])), interpolation=cv2.INTER_LINEAR) img_y = cv2.resize( plane_y, (int(scale_y * plane_y.shape[1]), int(scale_y * plane_y.shape[0])), interpolation=cv2.INTER_LINEAR) img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # congvert to rgb img_x = cv2.cvtColor(img_x, cv2.COLOR_GRAY2RGB) # congvert to rgb img_y = cv2.cvtColor(img_y, cv2.COLOR_GRAY2RGB) # congvert to rgb ######################################################################## # PyQt GUI Code Section # Section of code to get initial value to be stored in a pickle object # (Uncomment if you wish to modify default_data.pickle) # viewer_param = { # "arrs": (img, img_x, img_y), # "filepath": self.filename # } # with open("default_data.pickle", "wb") as f: # viewer_param = pickle.dump(viewer_param, f) # exit() # Sleep until resume (for browse mode) if self.task != 'browse': while self.viewer.right_widget.automatic_mode.thread.pause: time.sleep(0.5) # Check whether thread should be killed (pause) if self.viewer.right_widget.automatic_mode.thread.terminate: exit() # Check whether thread should be killed (general) if self.viewer.right_widget.automatic_mode.thread.terminate: exit() # Need to emit signal here (to draw images) self.viewer.widget.agent_signal.emit({ "arrs": (img, img_x, img_y), "agent_loc": current_point, "target": target_point, "error": self.cur_dist, "scale": self.xscale, "rect": self.rectangle, "task": self.task, "is_terminal": self.terminal, "cnt": self.cnt }) if self.task != 'browse': # Control agent speed if self.viewer.right_widget.automatic_mode.thread.speed == WorkerThread.FAST: time.sleep(0) elif self.viewer.right_widget.automatic_mode.thread.speed == WorkerThread.MEDIUM: time.sleep(0.5) else: time.sleep(1.5) ######################################################################## # save gif if self.saveGif: image_data = pyglet.image.get_buffer_manager( ).get_color_buffer().get_image_data() data = image_data.get_data('RGB', image_data.width * 3) arr = np.array(bytearray(data)).astype('uint8') arr = np.flip( np.reshape(arr, (image_data.height, image_data.width, -1)), 0) im = Image.fromarray(arr) self.gif_buffer.append(im) if not 
self.terminal: gifname = self.filename.split('.')[0] + '.gif' self.viewer.saveGif(gifname, arr=self.gif_buffer, duration=self.viz) if self.saveVideo: dirname = 'tmp_video' if self.cnt <= 1: if os.path.isdir(dirname): logger.warn( """Log directory {} exists! Use 'd' to delete it. """ .format(dirname)) act = input("select action: d (delete) / q (quit): " ).lower().strip() if act == 'd': shutil.rmtree(dirname, ignore_errors=True) else: raise OSError( "Directory {} exits!".format(dirname)) os.mkdir(dirname) frame = dirname + '/' + '%04d' % self.cnt + '.png' pyglet.image.get_buffer_manager().get_color_buffer().save( frame) if self.terminal: resolution = str(3 * self.viewer.img_width) + 'x' + str( 3 * self.viewer.img_height) save_cmd = [ 'ffmpeg', '-f', 'image2', '-framerate', '30', '-pattern_type', 'sequence', '-start_number', '0', '-r', '6', '-i', dirname + '/%04d.png', '-s', resolution, '-vcodec', 'libx264', '-b:v', '2567k', self.filename + '.mp4' ] subprocess.check_output(save_cmd) shutil.rmtree(dirname, ignore_errors=True)
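# --- Hedged usage sketch (assumption, not in the original file) --------------------------
# MedicalPlayer.step() takes both an action index and the q-value vector that produced it
# (the q-values feed the oscillation check), so a random rollout passes dummy q-values.
# files_list / data_type / task values are placeholders.
#
#     env = MedicalPlayer(files_list=test_files, data_type='BrainMRI', task='eval',
#                         screen_dims=(45, 45, 45), max_num_frames=200)
#     ob, done = env.reset(), False
#     while not done:
#         act = env.action_space.sample()
#         ob, reward, done, info = env.step(act, qvalues=[0.0] * env.actions)
#     print(info['distError'])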
class AgentBase(GymEnv):
    def __init__(self, agentIdent, is_train=False, auto_restart=True, **kwargs):
        # super(AgentBase, self).__init__(name='torcs')
        self.auto_restart = auto_restart
        self._isTrain = is_train
        self._agentIdent = agentIdent
        self._kwargs = kwargs
        self._init()

    def _init(self):
        logger.info("[{}]: agent init, isTrain={}".format(self._agentIdent, self._isTrain))
        self._episodeCount = -1
        from tensorpack.utils.utils import get_rng
        self._rng = get_rng(self)
        from tensorpack.utils.stats import StatCounter
        self.reset_stat()
        self.rwd_counter = StatCounter()
        self._memorySaver = None
        save_dir = self._kwargs.pop('save_dir', None)
        if save_dir is not None:
            self._memorySaver = MemorySaver(save_dir,
                                            self._kwargs.pop('max_save_item', 3),
                                            self._kwargs.pop('min_save_score', None),
                                            )
        self.restart_episode()

    def restart_episode(self):
        self.rwd_counter.reset()
        self.__ob = self.reset()

    def finish_episode(self):
        score = self.rwd_counter.sum
        self.stats['score'].append(score)
        logger.info("episode finished, rewards = {:.3f}, episode = {}, steps = {}"
                    .format(score, self._episodeCount, self._episodeSteps))

    def current_state(self):
        return self.__ob

    def reset(self):
        self._episodeCount += 1
        ret = self._reset()
        self._episodeRewards = 0.
        self._episodeSteps = 0
        if self._memorySaver:
            self._memorySaver.createMemory(self._episodeCount)
        logger.info("restart, episode={}".format(self._episodeCount))
        return ret

    @abc.abstractmethod
    def _reset(self):
        pass

    def action(self, pred):
        ob, act, r, isOver, info = self._step(pred)
        self.rwd_counter.feed(r)
        if self._memorySaver:
            self._memorySaver.addCurrent(ob, act, r, isOver)
        self.__ob = ob
        self._episodeSteps += 1
        self._episodeRewards += r
        if isOver:
            self.finish_episode()
            if self.auto_restart:
                self.restart_episode()
        return act, r, isOver

    @abc.abstractmethod
    def _step(self, action):
        raise NotImplementedError

    def get_action_space(self):
        raise NotImplementedError
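# --- Minimal subclass sketch (assumption, not in the original file) ----------------------
# AgentBase only requires _reset() and _step(); _step() must return the 5-tuple
# (observation, action, reward, isOver, info) consumed by action() above. self._env and
# its API are hypothetical placeholders.
#
#     class RandomAgent(AgentBase):
#         def _reset(self):
#             return self._env.reset()
#
#         def _step(self, pred):
#             act = self._env.action_space.sample()
#             ob, r, is_over, info = self._env.step(act)
#             return ob, act, r, is_over, info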
class SoccerPlayer(RLEnvironment): """ A wrapper for pygame_soccer emulator. Will automatically restart when a real episode ends (isOver might be just lost of lives but not game over). """ SOCCER_WIDTH = 288 SOCCER_HEIGHT = 192 def __init__(self, viz=0, field=None, partial=False, radius=2, frame_skip=4, image_shape=(84, 84), mode=None, team_size=1, ai_frame_skip=1, raw_env=soccer_environment.SoccerEnvironment): super(SoccerPlayer, self).__init__() if team_size > 1 and mode != None: self.mode = mode.split(',') else: self.mode = [mode] self.field = field self.partial = partial self.viz = viz if self.viz: self.renderer_options = soccer_renderer.RendererOptions( show_display=True, max_fps=10, enable_key_events=True) else: self.renderer_options = None if self.field == 'large': map_path = file_util.resolve_path(__file__, '../data/map/soccer_large.tmx') else: map_path = None self.team_size = team_size self.env_options = soccer_environment.SoccerEnvironmentOptions( team_size=self.team_size, map_path=map_path, ai_frame_skip=ai_frame_skip) self.env = raw_env(env_options=self.env_options, renderer_options=self.renderer_options) self.computer_team_name = self.env.team_names[1] self.player_team_name = self.env.team_names[0] # Partial if self.partial: self.radius = radius self.player_agent_index = self.env.get_agent_index( self.player_team_name, 0) self.actions = self.env.actions self.frame_skip = frame_skip self.image_shape = image_shape self.last_info = {} self.agent_actions = ['STAND'] * (self.team_size * 2) self.changing_counter = 0 self.timestep = 0 self.current_episode_score = StatCounter() self.restart_episode() def _grab_raw_image(self): self.env.render() if self.partial: screenshot = self.env.renderer.get_po_screenshot( self.player_agent_index, self.radius) else: screenshot = self.env.renderer.get_screenshot() return screenshot def _get_computer_actions(self): # Collaborator for i in range(self.team_size): index = self.env.get_agent_index(self.player_team_name, i) action = self.env.state.get_agent_action(index) self.agent_actions[self.team_size * 0 + i] = action # Opponent for i in range(self.team_size): index = self.env.get_agent_index(self.computer_team_name, i) action = self.env.state.get_agent_action(index) self.agent_actions[self.team_size * 1 + i] = action return np.asarray([ self.env.actions.index(act if act else 'STAND') for act in self.agent_actions ]) def _set_opponent_mode(self, mode): for i in range(self.team_size): index = self.env.get_agent_index(self.computer_team_name, i) m = mode[i] self.env.state.set_agent_mode(index, m) def _set_collaborator_mode(self, mode): for i in range(1, self.team_size): index = self.env.get_agent_index(self.player_team_name, i) m = mode[i - 1] self.env.state.set_agent_mode(index, m) def _set_computer_mode(self, mode): if mode[0] == None or len(mode) < self.team_size * 2 - 1: return if mode[0] in ['OFFENVIE', 'DFFENSIVE']: # Collaborator if self.team_size >= 2: self._set_collaborator_mode(mode[:(self.team_size - 1)]) # Opponent self._set_opponent_mode(mode[(self.team_size - 1):]) def current_state(self): ret = self._grab_raw_image() ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY) ret = cv2.resize(ret, self.image_shape) return ret.astype('uint8') # to save some memory def get_action_space(self): return DiscreteActionSpace(len(self.actions)) def finish_episode(self): self.stats['score'].append(self.current_episode_score.sum) def restart_episode(self): self.current_episode_score.reset() self.env.reset() self._set_computer_mode(self.mode) self.last_raw_screen = 
self._grab_raw_image() self.changing_counter = 0 self.timestep = 0 def action(self, act): ball_pos_agent_old = self.env.state.get_ball_possession() r = 0 ball_poss_old = self.env.state.get_ball_possession()['team_name'] for k in range(self.frame_skip): self.timestep += 1 if k == self.frame_skip - 1: self.last_raw_screen = self._grab_raw_image() if self.mode[0] == 'WEAKCOOP': actions = {} for team_name in self.env.team_names: for team_agent_index in range(self.env.options.team_size): agent_index = self.env.get_agent_index( team_name, team_agent_index) agent_action = self.env._get_ai_action( team_name, team_agent_index) print(team_name + self.env.state.get_agent_mode(agent_index)) actions[agent_index] = agent_action player_index = self.env.get_agent_index( self.player_team_name, 0) coop_index = self.env.get_agent_index(self.player_team_name, 1) actions[player_index] = self.env.actions[act] if random.random() < 0.5: actions[coop_index] = random.choice(self.env.actions) ret = self.env.take_action(actions) elif self.mode[0] == 'ALL_RANDOM': if self.team_size == 1: player_index = self.env.get_agent_index( self.player_team_name, 0) opponent_index = self.env.get_agent_index( self.computer_team_name, 0) actions = { player_index: self.env.actions[act], opponent_index: random.choice(self.env.actions) } else: actions = {} for team_name in [ self.player_team_name, self.computer_team_name ]: for team_index in range(self.team_size): agent_index = self.env.get_agent_index( team_name, team_index) actions[agent_index] = random.choice( self.env.actions) player_index = self.env.get_agent_index( self.player_team_name, 0) actions[player_index] = self.env.actions[act] ret = self.env.take_action(actions) # else: # print(self.env.actions[act]) # ret = self.env.take_action(self.env.actions[act]) else: if self.mode[0] == 'OPPONENT_DYNAMIC': choices = ['OFFENSIVE', 'DEFENSIVE'] if self.timestep % random.randint(4, 10) == 0: new_modes = [ random.choice(choices) for i in range(self.team_size) ] self._set_opponent_mode(new_modes) if self.mode[0] == 'COOP_DYNAMIC': choices = ['OFFENSIVE', 'DEFENSIVE'] if self.timestep % random.randint(4, 10) == 0: new_modes = [ random.choice(choices) for i in range(self.team_size - 1) ] self._set_collaborator_mode(new_modes) actions = {} for team_name in self.env.team_names: for team_agent_index in range(self.env.options.team_size): agent_index = self.env.get_agent_index( team_name, team_agent_index) agent_action = self.env._get_ai_action( team_name, team_agent_index) # print(team_name + self.env.state.get_agent_mode(agent_index)) actions[agent_index] = agent_action player_index = self.env.get_agent_index( self.player_team_name, 0) actions[player_index] = self.env.actions[act] ret = self.env.take_action(actions) if k == 0: self.last_info['agent_actions'] = self._get_computer_actions() r += ret.reward if self.env.state.is_terminal(): break self.current_episode_score.feed(r) isOver = self.env.state.is_terminal() ball_pos_agent_new = self.env.state.get_ball_possession() if ball_pos_agent_old['team_name'] == ball_pos_agent_new[ 'team_name'] and ball_pos_agent_new['team_name'] == 'PLAYER': if ball_pos_agent_old['team_agent_index'] != ball_pos_agent_new[ 'team_agent_index']: self.changing_counter += 1 if isOver: self.finish_episode() self.restart_episode() return (r, isOver) def get_internal_state(self): return self.last_info def get_changing_counter(self): return self.changing_counter
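# --- Hedged usage sketch (assumption, not in the original file) --------------------------
# A 1v1 game against the built-in AI with a randomly acting controlled agent.
#
#     player = SoccerPlayer(field='large', team_size=1, frame_skip=4)
#     space = player.get_action_space()
#     is_over = False
#     while not is_over:
#         r, is_over = player.action(space.sample())
#     print(player.get_changing_counter())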
class ExpReplay(DataFlow, Callback): """ Implement experience replay in the paper `Human-level control through deep reinforcement learning <http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html>`_. This implementation provides the interface as a :class:`DataFlow`. This DataFlow is __not__ fork-safe (thus doesn't support multiprocess prefetching). This implementation assumes that state is batch-able, and the network takes batched inputs. """ def __init__(self, predictor_io_names, player, state_shape, batch_size, memory_size, init_memory_size, init_exploration, update_frequency, frame_history_len): """ Args: predictor_io_names (tuple of list of str): input/output names to predict Q value from state. player (RLEnvironment): the player. update_frequency (int): number of new transitions to add to memory after sampling a batch of transitions for training. frame_history_len (int): length of history frames to concat. Zero-filled initial frames. """ # automatically save args as self.key = value for k, v in locals().items(): if k != 'self': setattr(self, k, v) # override the asignment above with int version self.init_memory_size = int(self.init_memory_size) self.exploration = init_exploration self.num_actions = player.action_space.n logger.info("Number of Legal actions: {}".format(self.num_actions)) self.rng = get_rng(self) self._init_memory_flag = threading.Event( ) # tell if memory has been initialized # a queue to receive notifications to populate memory # TODO why maxsize=5? self._populate_job_queue = queue.Queue(maxsize=5) self.mem = ReplayMemory(memory_size, state_shape, frame_history_len) self._current_ob = self.player.reset() self._player_scores = StatCounter() self._player_IOU = StatCounter() self._player_stuck = StatCounter() self._player_distances = StatCounter() self._player_qvals = StatCounter() self._player_best_qvals = StatCounter() # print("dims of expreplay history ", np.ndim(self.mem.recent_state())) def get_simulator_thread(self): # spawn a separate thread to run policy def populate_job_func(): self._populate_job_queue.get() for _ in range(self.update_frequency): self._populate_exp() th = ShareSessionThread(LoopThread(populate_job_func, pausable=False)) th.name = "SimulatorThread" return th def _init_memory(self): logger.info("Populating replay memory with epsilon={} ...".format( self.exploration)) with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < self.init_memory_size: self._populate_exp() pbar.update() self._init_memory_flag.set() # quickly fill the memory for debug def _fake_init_memory(self): from copy import deepcopy with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < 5: self._populate_exp() pbar.update() while len(self.mem) < self.init_memory_size: self.mem.append(deepcopy(self.mem._hist[0])) pbar.update() self._init_memory_flag.set() def _populate_exp(self): """ populate a transition by epsilon-greedy""" old_s = self._current_ob # build a history state # TODO we don't need history anymore, right? history = self.mem.recent_state() # shp (1, 15, 15, 15) ndim 4 history.append( old_s ) # TODO: history is size 1 at init, should we actually append?? 
# shp (2, 15, 15, 15) ndim 4 if np.ndim(history) == 4: # 3d states history = np.stack(history, axis=3) # shp (15, 15, 15, 2) ndim 4 # shp of history[None, :, :, :, :] is (1, 15, 15, 15, 2) ndim 5 # assume batched network - this is the bottleneck q_values = self.predictor(history[None, :, :, :, :])[0][0] else: history = np.stack(history, axis=2) # assume batched network - this is the bottleneck q_values = self.predictor(history[None, :, :, :])[0][0] # update q values for qv in q_values: self._player_qvals.feed(qv) self._player_best_qvals.feed(np.max(q_values)) # TODO no longer need history len, right? if self.rng.rand() <= self.exploration or (len(self.mem) <= self.frame_history_len): act = self.rng.choice(range(self.num_actions)) else: # if there is a tie for max, randomly choose between them act = np.random.choice( np.flatnonzero(np.isclose(q_values, q_values.max(), atol=0.5))) # print("pop_experience act {} qvals {}".format(act, q_values)) self._current_ob, reward, isOver, info = self.player.step(act) if isOver: # log these only at the end self._player_scores.feed(info['score']) self._player_IOU.feed(info['IoU']) self._player_stuck.feed(info['stuck']) self.player.reset() self.mem.append(Experience(old_s, act, reward, isOver)) def _debug_sample(self, sample): import cv2 def view_state(comb_state): state = comb_state[:, :, :-1] next_state = comb_state[:, :, 1:] r = np.concatenate( [state[:, :, k] for k in range(self.frame_history_len)], axis=1) r2 = np.concatenate( [next_state[:, :, k] for k in range(self.frame_history_len)], axis=1) r = np.concatenate([r, r2], axis=0) cv2.imshow("state", r) cv2.waitKey() print("Act: ", sample[2], " reward:", sample[1], " isOver: ", sample[3]) if sample[1] or sample[3]: view_state(sample[0]) def get_data(self): # wait for memory to be initialized self._init_memory_flag.wait() while True: idx = self.rng.randint(self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - self.frame_history_len - 1, size=self.batch_size) batch_exp = [self.mem.sample(i) for i in idx] yield self._process_batch(batch_exp) self._populate_job_queue.put(1) def _process_batch(self, batch_exp): """decompose batch_exp into list of matrices""" state = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') return [state, action, reward, isOver] def _setup_graph(self): self.predictor = self.trainer.get_predictor(*self.predictor_io_names) def _before_train(self): self._init_memory() self._simulator_th = self.get_simulator_thread() self._simulator_th.start() def _trigger(self): """log player statistics in training periodically""" logger.info("Logging stats... 
") scores = self._player_scores qvals = self._player_qvals best_qs = self._player_best_qvals IoU = self._player_IOU try: if scores.count: self.trainer.monitors.put_scalar('expreplay/mean_score', scores.average) self.trainer.monitors.put_scalar('expreplay/max_score', scores.max) if IoU.count: self.trainer.monitors.put_scalar('expreplay/mean_IoU', IoU.average) self.trainer.monitors.put_scalar('expreplay/max_IoU', IoU.max) if qvals.count: self.trainer.monitors.put_scalar('expreplay/max_qval', qvals.max) self.trainer.monitors.put_scalar('expreplay/mean_qval', qvals.average) if best_qs.count: self.trainer.monitors.put_scalar('expreplay/max_best_qval', best_qs.max) self.trainer.monitors.put_scalar('expreplay/mean_best_qval', best_qs.average) except Exception: logger.exception("Cannot log training scores.") scores.reset() IoU.reset() qvals.reset() best_qs.reset() # monitor number of played games and successes of reaching the target if self.player.num_games.count: self.trainer.monitors.put_scalar( 'expreplay/n_games', np.asscalar(self.player.num_games.sum)) else: self.trainer.monitors.put_scalar('expreplay/n_games', 0) if self.player.num_backtracked.count: self.trainer.monitors.put_scalar( 'expreplay/n_backtracked', np.asscalar(self.player.num_backtracked.sum)) else: self.trainer.monitors.put_scalar('expreplay/n_backtracked', 0) if self.player.num_backtracked.count: self.trainer.monitors.put_scalar( 'expreplay/n_backtracked', np.asscalar(self.player.num_backtracked.sum)) else: self.trainer.monitors.put_scalar('expreplay/n_backtracked', 0) # count wall collisions if self.player.num_go_out.count: self.trainer.monitors.put_scalar( 'expreplay/num_go_out', np.asscalar(self.player.num_go_out.sum)) else: self.trainer.monitors.put_scalar('expreplay/num_go_out', 0) if self.player.num_success.count: self.trainer.monitors.put_scalar( 'expreplay/n_success', np.asscalar(self.player.num_success.sum)) self.trainer.monitors.put_scalar( 'expreplay/n_success_ratio', self.player.num_success.sum / self.player.num_games.sum) else: self.trainer.monitors.put_scalar('expreplay/n_success', 0) self.trainer.monitors.put_scalar('expreplay/n_success_ratio', 0) # length of trials if self.player.episode_duration.count: try: self.trainer.monitors.put_scalar( 'expreplay/avg_episode_duration', np.asscalar(self.player.episode_duration.average)) except: self.trainer.monitors.put_scalar( 'expreplay/avg_episode_duration', self.player.episode_duration.average) # count different actions if self.player.num_act0.count: self.trainer.monitors.put_scalar( 'expreplay/num_act0', np.asscalar(self.player.num_act0.sum)) else: self.trainer.monitors.put_scalar('expreplay/num_act0', 0) if self.player.num_act1.count: self.trainer.monitors.put_scalar( 'expreplay/num_act1', np.asscalar(self.player.num_act1.sum)) else: self.trainer.monitors.put_scalar('expreplay/num_act1', 0) if self.player.num_act2.count: self.trainer.monitors.put_scalar( 'expreplay/num_act2', np.asscalar(self.player.num_act2.sum)) else: self.trainer.monitors.put_scalar('expreplay/num_act2', 0) if self.player.num_act3.count: self.trainer.monitors.put_scalar( 'expreplay/num_act3', np.asscalar(self.player.num_act3.sum)) else: self.trainer.monitors.put_scalar('expreplay/num_act3', 0) if self.player.num_act4.count: self.trainer.monitors.put_scalar( 'expreplay/num_act4', np.asscalar(self.player.num_act4.sum)) else: self.trainer.monitors.put_scalar('expreplay/num_act4', 0) if self.player.num_act5.count: self.trainer.monitors.put_scalar( 'expreplay/num_act5', 
np.asscalar(self.player.num_act5.sum)) else: self.trainer.monitors.put_scalar('expreplay/num_act5', 0) # reset stats after logging to tensorboard self.player.reset_stat()
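# --- Illustrative sketch (not part of the original source) -------------------
# ExpReplay above is both a DataFlow (get_data yields training batches) and a
# Callback (_before_train starts the simulator thread, _trigger logs stats),
# so it is handed to the trainer twice.  The wiring below follows tensorpack's
# DQN example API as I understand it and may need adjusting; `model` and the
# hyper-parameters are placeholders, not names taken from this code base.
from tensorpack import (TrainConfig, QueueInput, SimpleTrainer,
                        launch_train_with_config, ModelSaver)


def train_with_expreplay(expreplay, model):
    config = TrainConfig(
        data=QueueInput(expreplay),      # ExpReplay used as the DataFlow
        model=model,
        callbacks=[
            ModelSaver(),
            expreplay,                   # ExpReplay used as a Callback too
        ],
        steps_per_epoch=10000,
        max_epoch=100,
    )
    launch_train_with_config(config, SimpleTrainer())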
class SoccerPlayer(RLEnvironment):
    """
    A wrapper for pygame_soccer emulator.
    Will automatically restart when a real episode ends
    (isOver might be just lost of lives but not game over).
    """
    SOCCER_WIDTH = 288
    SOCCER_HEIGHT = 192

    def __init__(self, viz=0, height_range=(None, None), field='large',
                 partial=False, radius=2, frame_skip=4, image_shape=(84, 84),
                 nullop_start=30, mode=None, team_size=2, ai_frame_skip=1):
        super(SoccerPlayer, self).__init__()
        self.mode = mode
        self.field = field
        self.partial = partial
        self.viz = viz
        assert mode == None, 'Not impl'
        assert field == 'large', 'No small 2vs2'
        if self.viz:
            self.renderer_options = soccer_renderer.RendererOptions(
                show_display=True, max_fps=10, enable_key_events=True)
        else:
            self.renderer_options = None

        map_path = file_util.resolve_path(__file__,
                                          '../data/map/soccer_large.tmx')
        self.team_size = team_size
        self.env_options = soccer_environment.SoccerEnvironmentOptions(
            team_size=self.team_size, map_path=map_path,
            ai_frame_skip=ai_frame_skip)
        self.env = soccer_environment.SoccerEnvironment(
            env_options=self.env_options,
            renderer_options=self.renderer_options)
        self.computer_team_name = self.env.team_names[1]
        self.player_team_name = self.env.team_names[0]

        # Partial
        if self.partial:
            self.radius = radius
            self.player_agent_index = self.env.get_agent_index(
                self.player_team_name, 0)

        self.width, self.height = self.SOCCER_WIDTH, self.SOCCER_HEIGHT
        self.actions = self.env.actions
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start
        self.height_range = height_range
        self.image_shape = image_shape
        self.last_info = {}
        self.agent_actions = ['STAND'] * (self.team_size * 2)
        self.current_episode_score = StatCounter()
        self.restart_episode()

    def _get_computer_actions(self):
        # Collaborator
        for i in range(self.team_size):
            index = self.env.get_agent_index(self.player_team_name, i)
            action = self.env.state.get_agent_action(index)
            self.agent_actions[self.team_size * 0 + i] = action
        # Opponent
        for i in range(self.team_size):
            index = self.env.get_agent_index(self.computer_team_name, i)
            action = self.env.state.get_agent_action(index)
            self.agent_actions[self.team_size * 1 + i] = action
        return np.asarray([
            self.env.actions.index(act if act else 'STAND')
            for act in self.agent_actions
        ])

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        self.env.render()
        if self.partial:
            screenshot = self.env.renderer.get_po_screenshot(
                self.player_agent_index, self.radius)
        else:
            screenshot = self.env.renderer.get_screenshot()
        return screenshot

    def current_state(self):
        """
        :returns: a gray-scale (h, w) uint8 image
        """
        ret = self._grab_raw_image()
        # max-pooled over the last screen
        # ret = np.maximum(ret, self.last_raw_screen)
        '''
        if self.viz:
            if isinstance(self.viz, float):
                cv2.imshow('soccer', ret)
                cv2.waitKey(1)
        '''
        # ret = ret[self.height_range[0]:self.height_range[1], :].astype('float32')
        # 0.299, 0.587, 0.114, same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)
        ret = cv2.resize(ret, self.image_shape)
        return ret.astype('uint8')  # to save some memory

    def get_action_space(self):
        return DiscreteActionSpace(len(self.actions))

    def finish_episode(self):
        self.stats['score'].append(self.current_episode_score.sum)

    def restart_episode(self):
        self.current_episode_score.reset()
        self.env.reset()
        self.last_raw_screen = self._grab_raw_image()

    def action(self, act):
        """
        :param act: an index of the action
        :returns: (reward, isOver)
        """
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            ret = self.env.take_action(self.env.actions[act])
            if k == 0:
                self.last_info['agent_actions'] = self._get_computer_actions()
            r += ret.reward
            if self.env.state.is_terminal():
                break
        self.current_episode_score.feed(r)
        isOver = self.env.state.is_terminal()
        if isOver:
            self.finish_episode()
            self.restart_episode()
        return (r, isOver)

    def get_internal_state(self):
        return self.last_info
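# --- Illustrative sketch (not part of the original source) -------------------
# current_state() above converts the RGB screenshot to grayscale and resizes it
# to `image_shape` before storing it as uint8.  The same preprocessing in
# isolation, applied to a random image (shapes are illustrative):
import cv2
import numpy as np

frame = np.random.randint(0, 256, size=(192, 288, 3), dtype=np.uint8)  # H x W x 3
gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)                          # H x W
small = cv2.resize(gray, (84, 84))                                      # 84 x 84
state = small.astype('uint8')   # uint8 keeps the replay buffer small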
class ExpReplay(DataFlow, Callback): """ Implement experience replay in the paper `Human-level control through deep reinforcement learning <http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html>`_. This implementation provides the interface as a :class:`DataFlow`. This DataFlow is __not__ fork-safe (thus doesn't support multiprocess prefetching). This implementation assumes that state is batch-able, and the network takes batched inputs. """ def __init__(self, predictor_io_names, player, state_shape, batch_size, memory_size, init_memory_size, init_exploration, update_frequency, history_len, arg_type=None): """ Args: predictor_io_names (tuple of list of str): input/output names to predict Q value from state. player (RLEnvironment): the player. update_frequency (int): number of new transitions to add to memory after sampling a batch of transitions for training. history_len (int): length of history frames to concat. Zero-filled initial frames. """ init_memory_size = int(init_memory_size) for k, v in locals().items(): if k != 'self': setattr(self, k, v) self.exploration = init_exploration self.num_actions = player.action_space.n logger.info("Number of Legal actions: {}".format(self.num_actions)) self.rng = get_rng(self) self._init_memory_flag = threading.Event() # tell if memory has been initialized # a queue to receive notifications to populate memory self._populate_job_queue = queue.Queue(maxsize=5) self.mem = ReplayMemory(memory_size, state_shape, history_len) ############################################################################### # HITL UPDATE self.hmem_full = False if self.update_frequency < 4: self.hmem = HumanDemReplayMemory(memory_size, state_shape, history_len, arg_type=arg_type) self.hmem.load_experience() self.hmem_full = True logger.info("HITL buffer full") ############################################################################### self._current_ob = self.player.reset() self._player_scores = StatCounter() self._player_distError = StatCounter() def get_simulator_thread(self): # spawn a separate thread to run policy def populate_job_func(): self._populate_job_queue.get() ############################################################################### # HITL UPDATE # as self.update_frequency = 0 during pretraining, no workers will be initialized. 
############################################################################### #logger.info("update_frequency: {}".format(self.update_frequency)) for _ in range(int(self.update_frequency)): self._populate_exp() th = ShareSessionThread(LoopThread(populate_job_func, pausable=False)) th.name = "SimulatorThread" return th def _init_memory(self): logger.info("Populating replay memory with epsilon={} ...".format(self.exploration)) with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < self.init_memory_size: self._populate_exp() pbar.update() self._init_memory_flag.set() # quickly fill the memory for debug def _fake_init_memory(self): from copy import deepcopy with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < 5: self._populate_exp() pbar.update() while len(self.mem) < self.init_memory_size: self.mem.append(deepcopy(self.mem._hist[0])) pbar.update() self._init_memory_flag.set() def _populate_exp(self): """ populate a transition by epsilon-greedy""" old_s = self._current_ob # initialize q_values to zeros q_values = [0, ] * self.num_actions if self.rng.rand() <= self.exploration or (len(self.mem) <= self.history_len): act = self.rng.choice(range(self.num_actions)) else: # build a history state history = self.mem.recent_state() history.append(old_s) if np.ndim(history) == 4: # 3d states history = np.stack(history, axis=3) # assume batched network - this is the bottleneck q_values = self.predictor(history[None, :, :, :, :])[0][0] else: history = np.stack(history, axis=2) # assume batched network - this is the bottleneck q_values = self.predictor(history[None, :, :, :])[0][0] act = np.argmax(q_values) self._current_ob, reward, isOver, info = self.player.step(act, q_values) if isOver: # if info['gameOver']: # only record score when a whole game is over (not when an episode is over) # self._player_scores.feed(info['score']) self._player_scores.feed(info['score']) self._player_distError.feed(info['distError']) self.player.reset() # As generated by AI human = False self.mem.append(Experience(old_s, act, reward, isOver, False)) def _debug_sample(self, sample): import cv2 def view_state(comb_state): state = comb_state[:, :, :-1] next_state = comb_state[:, :, 1:] r = np.concatenate([state[:, :, k] for k in range(self.history_len)], axis=1) r2 = np.concatenate([next_state[:, :, k] for k in range(self.history_len)], axis=1) r = np.concatenate([r, r2], axis=0) cv2.imshow("state", r) cv2.waitKey() print("Act: ", sample[2], " reward:", sample[1], " isOver: ", sample[3]) if sample[1] or sample[3]: view_state(sample[0]) def get_data(self): # wait for memory to be initialized self._init_memory_flag.wait() ############################################################################### # HITL UPDATE # if self.update_frequency == 0: # logger.info("logging update freq ...".format(self.update_frequency)) while True: # Pretraining only sampling from HITL buffer if self.update_frequency == 0: idx = self.rng.randint( self._populate_job_queue.maxsize * 4, len(self.hmem)- self.history_len - 1, size=self.batch_size) batch_exp = [self.hmem.sample(i) for i in idx] yield self._process_batch(batch_exp) logger.info("Human batch ...") self._populate_job_queue.put(1) # After pretraining sampling from both HITL and agent buffer elif self.hmem_full == True: ex_idx = self.rng.randint( self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - self.history_len - 1, size=38) #38 hu_idx = self.rng.randint( self._populate_job_queue.maxsize * 4, len(self.hmem)- self.history_len - 1, 
size=10) #10 batch_exp = [self.mem.sample(i) for i in ex_idx] for j in hu_idx: batch_exp.append(self.hmem.sample(j)) yield self._process_batch(batch_exp) logger.info("Mixed batch 0.8agent 0.2human ...") self._populate_job_queue.put(1) # HITL not implemented therefore only sample from agent buffer else: idx = self.rng.randint( self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - self.history_len - 1, size=self.batch_size) batch_exp = [self.mem.sample(i) for i in idx] yield self._process_batch(batch_exp) self._populate_job_queue.put(1) def _process_batch(self, batch_exp): state = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') human = np.asarray([e[4] for e in batch_exp], dtype='bool') return [state, action, reward, isOver, human] def _setup_graph(self): self.predictor = self.trainer.get_predictor(*self.predictor_io_names) def _before_train(self): self._init_memory() self._simulator_th = self.get_simulator_thread() self._simulator_th.start() def _trigger(self): # log player statistics in training v = self._player_scores dist = self._player_distError try: mean, max = v.average, v.max self.trainer.monitors.put_scalar('expreplay/mean_score', mean) self.trainer.monitors.put_scalar('expreplay/max_score', max) mean, max = dist.average, dist.max self.trainer.monitors.put_scalar('expreplay/mean_dist', mean) self.trainer.monitors.put_scalar('expreplay/max_dist', max) except Exception: logger.exception("Cannot log training scores.") v.reset() dist.reset() # monitor number of played games and successes of reaching the target if self.player.num_games.count: self.trainer.monitors.put_scalar('n_games', np.asscalar(self.player.num_games.sum)) else: self.trainer.monitors.put_scalar('n_games', 0) if self.player.num_success.count: self.trainer.monitors.put_scalar('n_success', np.asscalar(self.player.num_success.sum)) self.trainer.monitors.put_scalar('n_success_ratio', self.player.num_success.sum / self.player.num_games.sum) else: self.trainer.monitors.put_scalar('n_success', 0) self.trainer.monitors.put_scalar('n_success_ratio', 0) # reset stats self.player.reset_stat()
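# --- Illustrative sketch (not part of the original source) -------------------
# The mixed HITL batch above hard-codes 38 agent transitions and 10 human
# transitions, i.e. roughly an 0.8/0.2 split of a 48-sample batch.  A small
# helper that derives the split from the batch size and a desired human
# fraction could look like this; the names and the 0.2 default are assumptions,
# not part of the original code.
def mixed_batch_split(batch_size, human_fraction=0.2):
    """Return (n_agent, n_human) sizes for a mixed HITL batch."""
    n_human = int(round(batch_size * human_fraction))
    n_human = min(max(n_human, 0), batch_size)
    return batch_size - n_human, n_human


assert mixed_batch_split(48) == (38, 10)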
class ExpReplay(DataFlow, Callback): """ Implement experience replay in the paper `Human-level control through deep reinforcement learning <http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html>`_. This implementation provides the interface as a :class:`DataFlow`. This DataFlow is __not__ fork-safe (thus doesn't support multiprocess prefetching). This implementation assumes that state is batch-able, and the network takes batched inputs. """ def __init__(self, # model, agent_name, player, state_shape, num_actions, batch_size, memory_size, init_memory_size, init_exploration, update_frequency, encoding_file='../AutoEncoder/encoding.npy'): init_memory_size = int(init_memory_size) # self.model = model for k, v in locals().items(): if k != 'self': setattr(self, k, v) self.agent_name = agent_name self.exploration = init_exploration self.num_actions = num_actions self.encoding = np.load(encoding_file) logger.info("Number of Legal actions: {}, {}".format(*self.num_actions)) self.rng = get_rng(self) self._init_memory_flag = threading.Event() # tell if memory has been initialized # a queue to receive notifications to populate memory self._populate_job_queue = queue.Queue(maxsize=5) self.mem = ReplayMemory(memory_size, state_shape) self.player.reset() self.player.prepare() self._comb_mask = True self._fine_mask = None self._current_ob, self._action_space = self.get_state_and_action_spaces() self._player_scores = StatCounter() self._current_game_score = StatCounter() def get_combinations(self, curr_cards_char, last_cards_char): if len(curr_cards_char) > 10: card_mask = Card.char2onehot60(curr_cards_char).astype(np.uint8) mask = augment_action_space_onehot60 a = np.expand_dims(1 - card_mask, 0) * mask invalid_row_idx = set(np.where(a > 0)[0]) if len(last_cards_char) == 0: invalid_row_idx.add(0) valid_row_idx = [i for i in range(len(augment_action_space)) if i not in invalid_row_idx] mask = mask[valid_row_idx, :] idx_mapping = dict(zip(range(mask.shape[0]), valid_row_idx)) # augment mask # TODO: known issue: 555444666 will not decompose into 5554 and 66644 combs = get_combinations_nosplit(mask, card_mask) combs = [([] if len(last_cards_char) == 0 else [0]) + [clamp_action_idx(idx_mapping[idx]) for idx in comb] for comb in combs] if len(last_cards_char) > 0: idx_must_be_contained = set( [idx for idx in valid_row_idx if CardGroup.to_cardgroup(augment_action_space[idx]). \ bigger_than(CardGroup.to_cardgroup(last_cards_char))]) combs = [comb for comb in combs if not idx_must_be_contained.isdisjoint(comb)] self._fine_mask = np.zeros([len(combs), self.num_actions[1]], dtype=np.bool) for i in range(len(combs)): for j in range(len(combs[i])): if combs[i][j] in idx_must_be_contained: self._fine_mask[i][j] = True else: self._fine_mask = None else: mask = get_mask_onehot60(curr_cards_char, action_space, None).reshape(len(action_space), 15, 4).sum(-1).astype( np.uint8) valid = mask.sum(-1) > 0 cards_target = Card.char2onehot60(curr_cards_char).reshape(-1, 4).sum(-1).astype(np.uint8) combs = get_combinations_recursive(mask[valid, :], cards_target) idx_mapping = dict(zip(range(valid.shape[0]), np.where(valid)[0])) combs = [([] if len(last_cards_char) == 0 else [0]) + [idx_mapping[idx] for idx in comb] for comb in combs] if len(last_cards_char) > 0: valid[0] = True idx_must_be_contained = set( [idx for idx in range(len(action_space)) if valid[idx] and CardGroup.to_cardgroup(action_space[idx]). 
\ bigger_than(CardGroup.to_cardgroup(last_cards_char))]) combs = [comb for comb in combs if not idx_must_be_contained.isdisjoint(comb)] self._fine_mask = np.zeros([len(combs), self.num_actions[1]], dtype=np.bool) for i in range(len(combs)): for j in range(len(combs[i])): if combs[i][j] in idx_must_be_contained: self._fine_mask[i][j] = True else: self._fine_mask = None return combs def subsample_combs_masks(self, combs, masks, num_sample): if masks is not None: assert len(combs) == masks.shape[0] idx = np.random.permutation(len(combs))[:num_sample] return [combs[i] for i in idx], (masks[idx] if masks is not None else None) def get_state_and_action_spaces(self, action=None): def cards_char2embedding(cards_char): test = (action_space_onehot60 == Card.char2onehot60(cards_char)) test = np.all(test, axis=1) target = np.where(test)[0] return self.encoding[target[0]] last_two_cards_char = self.player.get_last_two_cards() last_cards_char = last_two_cards_char[0] if not last_cards_char: last_cards_char = last_two_cards_char[1] curr_cards_char = self.player.get_curr_handcards() if self._comb_mask: # print(curr_cards_char, last_cards_char) combs = self.get_combinations(curr_cards_char, last_cards_char) if len(combs) > self.num_actions[0]: combs, self._fine_mask = self.subsample_combs_masks(combs, self._fine_mask, self.num_actions[0]) # TODO: utilize temporal relations to speedup available_actions = [[action_space[idx] for idx in comb] for comb in combs] # print(available_actions) # print('-------------------------------------------') assert len(combs) > 0 if self._fine_mask is not None: self._fine_mask = self.pad_fine_mask(self._fine_mask) self.pad_action_space(available_actions) state = [np.stack([self.encoding[idx] for idx in comb]) for comb in combs] assert len(state) > 0 prob_state = self.player.get_state_prob() # test = action_space_onehot60 == Card.char2onehot60(last_cards_char) # test = np.all(test, axis=1) # target = np.where(test)[0] # assert target.size == 1 extra_state = np.concatenate([cards_char2embedding(last_two_cards_char[0]), cards_char2embedding(last_two_cards_char[1]), prob_state]) for i in range(len(state)): state[i] = np.concatenate([state[i], np.tile(extra_state[None, :], [state[i].shape[0], 1])], axis=-1) state = self.pad_state(state) assert state.shape[0] == self.num_actions[0] and state.shape[1] == self.num_actions[1] else: assert action is not None if self._fine_mask is not None: self._fine_mask = self._fine_mask[action] available_actions = self._action_space[action] state = self._current_ob[action:action+1, :, :] state = np.repeat(state, self.num_actions[0], axis=0) assert state.shape[0] == self.num_actions[0] and state.shape[1] == self.num_actions[1] return state, available_actions def pad_fine_mask(self, mask): if mask.shape[0] < self.num_actions[0]: mask = np.concatenate([mask, np.repeat(mask[-1:], self.num_actions[0] - mask.shape[0], 0)], 0) return mask def pad_action_space(self, available_actions): # print(available_actions) for i in range(len(available_actions)): available_actions[i] += [available_actions[i][-1]] * (self.num_actions[1] - len(available_actions[i])) if len(available_actions) < self.num_actions[0]: available_actions.extend([available_actions[-1]] * (self.num_actions[0] - len(available_actions))) # input is a list of N * HIDDEN_STATE def pad_state(self, state): # since out net uses max operation, we just dup the last row and keep the result same newstates = [] for s in state: assert s.shape[0] <= self.num_actions[1] s = np.concatenate([s, np.repeat(s[-1:, 
:], self.num_actions[1] - s.shape[0], axis=0)], axis=0) newstates.append(s) newstates = np.stack(newstates, axis=0) if len(state) < self.num_actions[0]: state = np.concatenate([newstates, np.repeat(newstates[-1:, :, :], self.num_actions[0] - newstates.shape[0], axis=0)], axis=0) else: state = newstates return state def get_simulator_thread(self): # spawn a separate thread to run policy def populate_job_func(): self._populate_job_queue.get() for _ in range(self.update_frequency): self._populate_exp() th = ShareSessionThread(LoopThread(populate_job_func, pausable=False)) th.name = "SimulatorThread" return th def _init_memory(self): logger.info("Populating replay memory with epsilon={} ...".format(self.exploration)) with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < self.init_memory_size: self._populate_exp() pbar.update() self._init_memory_flag.set() def _populate_exp(self): """ populate a transition by epsilon-greedy""" old_s = self._current_ob comb_mask = self._comb_mask if not self._comb_mask and self._fine_mask is not None: fine_mask = self._fine_mask if self._fine_mask.shape[0] == max(self.num_actions[0], self.num_actions[1]) \ else np.pad(self._fine_mask, (0, max(self.num_actions[0], self.num_actions[1]) - self._fine_mask.shape[0]), 'constant', constant_values=(0, 0)) else: fine_mask = np.ones([max(self.num_actions[0], self.num_actions[1])], dtype=np.bool) last_cards_char = self.player.get_last_outcards() if self.rng.rand() <= self.exploration: if not self._comb_mask and self._fine_mask is not None: q_values = np.random.rand(self.num_actions[1]) q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan act = np.nanargmax(q_values) # print(q_values) # print(act) else: act = self.rng.choice(range(self.num_actions[0 if comb_mask else 1])) else: q_values = self.curr_predictor(old_s[None, :, :, :], np.array([comb_mask]), np.array([fine_mask]))[0][0] if not self._comb_mask and self._fine_mask is not None: q_values = q_values[:self.num_actions[1]] assert np.all(q_values[np.where(np.logical_not(self._fine_mask))[0]] < -100) q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan act = np.nanargmax(q_values) assert act < self.num_actions[0 if comb_mask else 1] # print(q_values) # print(act) # clamp action to valid range act = min(act, self.num_actions[0 if comb_mask else 1] - 1) winner = -1 reward = 0 if comb_mask: isOver = False else: if len(last_cards_char) > 0: if act > 0: if not CardGroup.to_cardgroup(self._action_space[act]).bigger_than(CardGroup.to_cardgroup(last_cards_char)): print('warning, some error happened, ', self._action_space[act], last_cards_char) raise Exception("card comparison error") winner, isOver = self.player.step(self._action_space[act]) # step for AI farmers while not isOver and self.player.get_curr_agent_name() != self.agent_name: handcards = self.player.get_curr_handcards() last_two_cards = self.player.get_last_two_cards() prob_state = self.player.get_state_prob() action = self.predictors[self.player.get_curr_agent_name()].predict(handcards, last_two_cards, prob_state) winner, isOver = self.player.step(action) if isOver: if self.agent_name == winner: reward = 1 else: if self.player.get_all_agent_names().index(winner) + self.player.get_all_agent_names().index(self.agent_name) == 3: reward = 1 else: reward = -1 self._current_game_score.feed(reward) if isOver: self._player_scores.feed(self._current_game_score.sum) self.player.reset() self.player.prepare() self._comb_mask = True self.prestart() self._current_game_score.reset() else: 
self._comb_mask = not self._comb_mask self._current_ob, self._action_space = self.get_state_and_action_spaces(act if not self._comb_mask else None) self.mem.append(Experience(old_s, act, reward, isOver, comb_mask, fine_mask)) def prestart(self): while self.player.get_curr_agent_name() != self.agent_name: handcards = self.player.get_curr_handcards() last_two_cards = self.player.get_last_two_cards() prob_state = self.player.get_state_prob() action = self.predictors[self.player.get_curr_agent_name()].predict(handcards, last_two_cards, prob_state) self.player.step(action) self._current_ob, self._action_space = self.get_state_and_action_spaces() def get_data(self): # wait for memory to be initialized self._init_memory_flag.wait() while True: idx = self.rng.randint( self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - 1, size=self.batch_size) batch_exp = [self.mem.sample(i) for i in idx] yield self._process_batch(batch_exp) self._populate_job_queue.put(1) def _process_batch(self, batch_exp): state = np.asarray([e[0] for e in batch_exp], dtype='float32') action = np.asarray([e[1] for e in batch_exp], dtype='int32') reward = np.asarray([e[2] for e in batch_exp], dtype='float32') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') comb_mask = np.asarray([e[4] for e in batch_exp], dtype='bool') fine_mask = np.asarray([e[5] for e in batch_exp], dtype='bool') return [state, action, reward, isOver, comb_mask, fine_mask] def _setup_graph(self): self.curr_predictor = self.trainer.get_predictor([self.agent_name + '/state:0', self.agent_name + '_comb_mask:0', self.agent_name + '/fine_mask:0'], [self.agent_name + '/Qvalue:0']) self.predictors = {n: Predictor(self.trainer.get_predictor([n + '/state:0', n + '_comb_mask:0', n + '/fine_mask:0'], [n + '/Qvalue:0'])) for n in self.player.get_all_agent_names()} def _before_train(self): self.prestart() self._init_memory() self._simulator_th = self.get_simulator_thread() self._simulator_th.start() def _trigger(self): v = self._player_scores try: mean, max = v.average, v.max self.trainer.monitors.put_scalar('expreplay/mean_score', mean) self.trainer.monitors.put_scalar('expreplay/max_score', max) except Exception: logger.exception(self.agent_name + " Cannot log training scores.") v.reset()
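# --- Illustrative sketch (not part of the original source) -------------------
# pad_state() / pad_action_space() above pad a variable number of combinations
# to a fixed [num_actions[0], num_actions[1]] tensor by duplicating the last
# entry, which is safe because the network reduces over that axis with a max.
# A standalone version of the row-padding idea (array sizes are illustrative):
import numpy as np


def pad_rows(x, target_rows):
    """Pad a [n, d] array to [target_rows, d] by repeating its last row."""
    assert 0 < x.shape[0] <= target_rows
    if x.shape[0] < target_rows:
        pad = np.repeat(x[-1:], target_rows - x.shape[0], axis=0)
        x = np.concatenate([x, pad], axis=0)
    return x


padded = pad_rows(np.arange(6, dtype=np.float32).reshape(3, 2), 5)
assert padded.shape == (5, 2)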
class ExpReplay(DataFlow, Callback): """ Implement experience replay in the paper `Human-level control through deep reinforcement learning <http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html>`_. This implementation provides the interface as a :class:`DataFlow`. This DataFlow is __not__ fork-safe (thus doesn't support multiprocess prefetching). This implementation assumes that state is batch-able, and the network takes batched inputs. """ def __init__(self, predictor_io_names, player, state_shape, batch_size, memory_size, init_memory_size, init_exploration, update_frequency, history_len): """ Args: predictor_io_names (tuple of list of str): input/output names to predict Q value from state. player (RLEnvironment): the player. history_len (int): length of history frames to concat. Zero-filled initial frames. update_frequency (int): number of new transitions to add to memory after sampling a batch of transitions for training. """ init_memory_size = int(init_memory_size) for k, v in locals().items(): if k != 'self': setattr(self, k, v) self.exploration = init_exploration self.num_actions = player.action_space.n logger.info("Number of Legal actions: {}".format(self.num_actions)) self.rng = get_rng(self) self._init_memory_flag = threading.Event() # tell if memory has been initialized # a queue to receive notifications to populate memory self._populate_job_queue = queue.Queue(maxsize=5) self.mem = ReplayMemory(memory_size, state_shape, history_len) self._current_ob = self.player.reset() self._player_scores = StatCounter() self._current_game_score = StatCounter() def get_simulator_thread(self): # spawn a separate thread to run policy def populate_job_func(): self._populate_job_queue.get() for _ in range(self.update_frequency): self._populate_exp() th = ShareSessionThread(LoopThread(populate_job_func, pausable=False)) th.name = "SimulatorThread" return th def _init_memory(self): logger.info("Populating replay memory with epsilon={} ...".format(self.exploration)) with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < self.init_memory_size: self._populate_exp() pbar.update() self._init_memory_flag.set() # quickly fill the memory for debug def _fake_init_memory(self): from copy import deepcopy with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < 5: self._populate_exp() pbar.update() while len(self.mem) < self.init_memory_size: self.mem.append(deepcopy(self.mem._hist[0])) pbar.update() self._init_memory_flag.set() def _populate_exp(self): """ populate a transition by epsilon-greedy""" old_s = self._current_ob if self.rng.rand() <= self.exploration or (len(self.mem) <= self.history_len): act = self.rng.choice(range(self.num_actions)) else: # build a history state history = self.mem.recent_state() history.append(old_s) history = np.stack(history, axis=2) # assume batched network q_values = self.predictor(history[None, :, :, :])[0][0] # this is the bottleneck act = np.argmax(q_values) self._current_ob, reward, isOver, info = self.player.step(act) self._current_game_score.feed(reward) if isOver: if info['ale.lives'] == 0: # only record score when a whole game is over (not when an episode is over) self._player_scores.feed(self._current_game_score.sum) self._current_game_score.reset() self.player.reset() self.mem.append(Experience(old_s, act, reward, isOver)) def _debug_sample(self, sample): import cv2 def view_state(comb_state): state = comb_state[:, :, :-1] next_state = comb_state[:, :, 1:] r = np.concatenate([state[:, :, k] for k in 
range(self.history_len)], axis=1) r2 = np.concatenate([next_state[:, :, k] for k in range(self.history_len)], axis=1) r = np.concatenate([r, r2], axis=0) cv2.imshow("state", r) cv2.waitKey() print("Act: ", sample[2], " reward:", sample[1], " isOver: ", sample[3]) if sample[1] or sample[3]: view_state(sample[0]) def _process_batch(self, batch_exp): state = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') return [state, action, reward, isOver] # DataFlow method: def get_data(self): # wait for memory to be initialized self._init_memory_flag.wait() while True: idx = self.rng.randint( self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - self.history_len - 1, size=self.batch_size) batch_exp = [self.mem.sample(i) for i in idx] yield self._process_batch(batch_exp) self._populate_job_queue.put(1) # Callback methods: def _setup_graph(self): self.predictor = self.trainer.get_predictor(*self.predictor_io_names) def _before_train(self): self._init_memory() self._simulator_th = self.get_simulator_thread() self._simulator_th.start() def _trigger(self): v = self._player_scores try: mean, max = v.average, v.max self.trainer.monitors.put_scalar('expreplay/mean_score', mean) self.trainer.monitors.put_scalar('expreplay/max_score', max) except Exception: logger.exception("Cannot log training scores.") v.reset()
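# --- Illustrative sketch (not part of the original source) -------------------
# _populate_exp() above is plain epsilon-greedy: with probability
# `self.exploration` a random action is taken, otherwise the argmax of the
# predicted Q values.  In the usual DQN recipe this exploration rate is
# annealed over training; the schedule below is an assumption for illustration,
# not taken from this code.
import numpy as np


def epsilon_greedy(q_values, epsilon, rng=np.random):
    if rng.rand() <= epsilon:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))


def linear_epsilon(step, start=1.0, end=0.1, anneal_steps=1000000):
    frac = min(step / float(anneal_steps), 1.0)
    return start + frac * (end - start)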
class ExpReplay(DataFlow, Callback): """ Implement experience replay in the paper `Human-level control through deep reinforcement learning <http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html>`_. This implementation provides the interface as a :class:`DataFlow`. This DataFlow is __not__ fork-safe (thus doesn't support multiprocess prefetching). This implementation assumes that state is batch-able, and the network takes batched inputs. """ def __init__(self, predictor_io_names, player, state_shape, num_actions, batch_size, memory_size, init_memory_size, init_exploration, update_frequency, encoding_file='../AutoEncoder/encoding.npy'): """ Args: predictor_io_names (tuple of list of str): input/output names to predict Q value from state. player (RLEnvironment): the player. history_len (int): length of history frames to concat. Zero-filled initial frames. update_frequency (int): number of new transitions to add to memory after sampling a batch of transitions for training. """ init_memory_size = int(init_memory_size) for k, v in locals().items(): if k != 'self': setattr(self, k, v) self.exploration = init_exploration self.num_actions = num_actions self.encoding = np.load(encoding_file) logger.info("Number of Legal actions: {}".format(self.num_actions)) self.rng = get_rng(self) self._init_memory_flag = threading.Event( ) # tell if memory has been initialized # a queue to receive notifications to populate memory self._populate_job_queue = queue.Queue(maxsize=5) self.mem = ReplayMemory(memory_size, state_shape) self.player.reset() # init_cards = np.arange(36) # self.player.prepare_manual(init_cards) self.player.prepare() # self._current_ob = self.player.get_state_prob() self._current_ob = self.get_state() self._player_scores = StatCounter() self._current_game_score = StatCounter() def get_state(self): def cards_char2embedding(cards_char): test = (action_space_onehot60 == Card.char2onehot60(cards_char)) test = np.all(test, axis=1) target = np.where(test)[0] return self.encoding[target[0]] s = self.player.get_state_prob() s = np.concatenate( [Card.val2onehot60(self.player.get_curr_handcards()), s]) last_two_cards_char = self.player.get_last_two_cards() last_two_cards_char = [to_char(c) for c in last_two_cards_char] return np.concatenate([ s, cards_char2embedding(last_two_cards_char[0]), cards_char2embedding(last_two_cards_char[1]) ]) def get_simulator_thread(self): # spawn a separate thread to run policy def populate_job_func(): self._populate_job_queue.get() for _ in range(self.update_frequency): self._populate_exp() th = ShareSessionThread(LoopThread(populate_job_func, pausable=False)) th.name = "SimulatorThread" return th def _init_memory(self): logger.info("Populating replay memory with epsilon={} ...".format( self.exploration)) with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < self.init_memory_size: self._populate_exp() pbar.update() self._init_memory_flag.set() def _populate_exp(self): """ populate a transition by epsilon-greedy""" old_s = self._current_ob if self.rng.rand() <= self.exploration: act = self.rng.choice(range(self.num_actions)) else: mask = get_mask(to_char(self.player.get_curr_handcards()), action_space, to_char(self.player.get_last_outcards())) q_values = self.predictor(old_s[None, ...])[0][0] q_values[mask == 0] = np.nan act = np.nanargmax(q_values) assert act < self.num_actions reward, isOver, _ = self.player.step_manual(to_value( action_space[act])) # step for AI while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN: _, 
reward, _ = self.player.step_auto() isOver = (reward != 0) if ROLE_ID_TO_TRAIN == 2: reward = -reward self._current_game_score.feed(reward) if isOver: # print('lord wins' if reward > 0 else 'farmer wins') self._player_scores.feed(self._current_game_score.sum) # print(self._current_game_score.sum) while True: self.player.reset() # init_cards = np.arange(36) # self.player.prepare_manual(init_cards) self.player.prepare() early_stop = False while self.player.get_role_ID() != ROLE_ID_TO_TRAIN: _, reward, _ = self.player.step_auto() isOver = (reward != 0) if isOver: print('prestart ends too early! now resetting env') early_stop = True break if early_stop: continue self._current_ob = self.get_state() break self._current_game_score.reset() self._current_ob = self.get_state() self.mem.append(Experience(old_s, act, reward, isOver)) def debug(self, cnt=100000): with get_tqdm(total=cnt) as pbar: for i in range(cnt): self.mem.append( Experience( np.zeros( [self.num_actions[0], self.num_actions[1], 256]), 0, 0)) # self._current_ob, self._action_space = self.get_state_and_action_spaces(None) pbar.update() def get_data(self): # wait for memory to be initialized self._init_memory_flag.wait() while True: idx = self.rng.randint(self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - 1, size=self.batch_size) batch_exp = [self.mem.sample(i) for i in idx] yield self._process_batch(batch_exp) self._populate_job_queue.put(1) def _process_batch(self, batch_exp): state = np.asarray([e[0] for e in batch_exp], dtype='float32') action = np.asarray([e[1] for e in batch_exp], dtype='int32') reward = np.asarray([e[2] for e in batch_exp], dtype='float32') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') return [state, action, reward, isOver] def _setup_graph(self): self.predictor = self.trainer.get_predictor(*self.predictor_io_names) def _before_train(self): while self.player.get_role_ID() != ROLE_ID_TO_TRAIN: self.player.step_auto() self._current_ob, self._action_space = self.get_state_and_action_spaces( ) self._init_memory() self._simulator_th = self.get_simulator_thread() self._simulator_th.start() def _trigger(self): v = self._player_scores try: mean, max = v.average, v.max self.trainer.monitors.put_scalar('expreplay/mean_score', mean) self.trainer.monitors.put_scalar('expreplay/max_score', max) except Exception: logger.exception("Cannot log training scores.") v.reset()
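# --- Illustrative sketch (not part of the original source) -------------------
# _populate_exp() above masks illegal card plays by setting their Q values to
# NaN and then taking np.nanargmax, so the greedy choice is restricted to legal
# actions.  The same trick in isolation (the mask and values are made up):
import numpy as np

q_values = np.array([0.3, 0.9, 0.1, 0.7], dtype=np.float32)
legal = np.array([1, 0, 1, 1], dtype=np.uint8)      # action 1 is illegal here

masked = q_values.copy()
masked[legal == 0] = np.nan
act = int(np.nanargmax(masked))
assert act == 3                                      # best among legal actions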
class AtariPlayer(RLEnvironment): """ A wrapper for atari emulator. Will automatically restart when a real episode ends (isOver might be just lost of lives but not game over). """ def __init__(self, rom_file, viz=0, height_range=(None, None), frame_skip=4, image_shape=(84, 84), nullop_start=30, live_lost_as_eoe=True): """ :param rom_file: path to the rom :param frame_skip: skip every k frames and repeat the action :param image_shape: (w, h) :param height_range: (h1, h2) to cut :param viz: visualization to be done. Set to 0 to disable. Set to a positive number to be the delay between frames to show. Set to a string to be a directory to store frames. :param nullop_start: start with random number of null ops :param live_losts_as_eoe: consider lost of lives as end of episode. useful for training. """ super(AtariPlayer, self).__init__() if not os.path.isfile(rom_file) and '/' not in rom_file: rom_file = get_dataset_path('atari_rom', rom_file) assert os.path.isfile(rom_file), \ "rom {} not found. Please download at {}".format(rom_file, ROM_URL) try: ALEInterface.setLoggerMode(ALEInterface.Logger.Warning) except AttributeError: if execute_only_once(): logger.warn("You're not using latest ALE") # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86 with _ALE_LOCK: self.ale = ALEInterface() self.rng = get_rng(self) self.ale.setInt(b"random_seed", self.rng.randint(0, 30000)) self.ale.setBool(b"showinfo", False) self.ale.setInt(b"frame_skip", 1) self.ale.setBool(b'color_averaging', False) # manual.pdf suggests otherwise. self.ale.setFloat(b'repeat_action_probability', 0.0) # viz setup if isinstance(viz, six.string_types): assert os.path.isdir(viz), viz self.ale.setString(b'record_screen_dir', viz) viz = 0 if isinstance(viz, int): viz = float(viz) self.viz = viz if self.viz and isinstance(self.viz, float): self.windowname = os.path.basename(rom_file) cv2.startWindowThread() cv2.namedWindow(self.windowname) self.ale.loadROM(rom_file.encode('utf-8')) self.width, self.height = self.ale.getScreenDims() self.actions = self.ale.getMinimalActionSet() self.live_lost_as_eoe = live_lost_as_eoe self.frame_skip = frame_skip self.nullop_start = nullop_start self.height_range = height_range self.image_shape = image_shape self.current_episode_score = StatCounter() self.restart_episode() def _grab_raw_image(self): """ :returns: the current 3-channel image """ m = self.ale.getScreenRGB() return m.reshape((self.height, self.width, 3)) def current_state(self): """ :returns: a gray-scale (h, w) uint8 image """ ret = self._grab_raw_image() # max-pooled over the last screen ret = np.maximum(ret, self.last_raw_screen) if self.viz: if isinstance(self.viz, float): cv2.imshow(self.windowname, ret) time.sleep(self.viz) ret = ret[self.height_range[0]:self.height_range[1], :].astype('float32') # 0.299,0.587.0.114. 
same as rgb2y in torch/image ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY) ret = cv2.resize(ret, self.image_shape) return ret.astype('uint8') # to save some memory def get_action_space(self): return DiscreteActionSpace(len(self.actions)) def finish_episode(self): self.stats['score'].append(self.current_episode_score.sum) def restart_episode(self): self.current_episode_score.reset() with _ALE_LOCK: self.ale.reset_game() # random null-ops start n = self.rng.randint(self.nullop_start) self.last_raw_screen = self._grab_raw_image() for k in range(n): if k == n - 1: self.last_raw_screen = self._grab_raw_image() self.ale.act(0) def action(self, act): """ :param act: an index of the action :returns: (reward, isOver) """ oldlives = self.ale.lives() r = 0 for k in range(self.frame_skip): if k == self.frame_skip - 1: self.last_raw_screen = self._grab_raw_image() r += self.ale.act(self.actions[act]) newlives = self.ale.lives() if self.ale.game_over() or \ (self.live_lost_as_eoe and newlives < oldlives): break self.current_episode_score.feed(r) isOver = self.ale.game_over() if self.live_lost_as_eoe: isOver = isOver or newlives < oldlives if isOver: self.finish_episode() if self.ale.game_over(): self.restart_episode() return (r, isOver)
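# --- Illustrative sketch (not part of the original source) -------------------
# current_state() above computes np.maximum(ret, self.last_raw_screen): some
# Atari games draw sprites only on alternating frames, so max-pooling the last
# two raw screens removes flicker before the grayscale/resize step.  In
# isolation (random images stand in for consecutive screens):
import numpy as np

prev_screen = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
curr_screen = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
deflickered = np.maximum(prev_screen, curr_screen)   # per-pixel max of the two frames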
def eval_child(model_cls, args, log_dir, model_dir, collect_hallu_stats=True):
    """
    Args:
        model_cls (PetridishModel) :
        args :
        log_dir (str): where to log
        model_dir (str) : where to load from
        collect_hallu_stats (bool) : whether to collect hallu stats if there are any.

    Return:
        eval_vals (list) : a list of evaluation related values. The first is the
        validation error on the specified validation set; it is followed by
        hallucination stats.
    """
    ckpt = tf.train.latest_checkpoint(model_dir)
    if not ckpt:
        logger.info("No model exists. Do not sort")
        return []
    args.compute_hallu_stats = True
    (model, args, ds_val, insrc_val, output_names, output_funcs) = \
        get_training_params(model_cls, args, is_training=False)
    n_outputs = len(output_names)
    logger.info("{} num vals present. Will use the final perf {} as eval score".format(
        n_outputs, output_names[-1]))
    stats_handlers = [StatCounter() for _ in range(n_outputs)]
    # additional handlers for hallucinations
    if collect_hallu_stats:
        hallu_stats_names = get_net_info_hallu_stats_output_names(
            model.net_info)
        stats_handlers.extend([StatCounter() for _ in hallu_stats_names])
        output_names.extend(hallu_stats_names)
    # Note at this point stats_handlers[n_outputs-1:] contains all
    # the value needed for evaluation.
    # batch size counter
    sample_counter = StatCounter()
    # ignore loading certain variables during inference
    ignore_names = getattr(model, 'load_ignore_var_names', [])
    pred_config = PredictConfig(
        model=model,
        input_names=model._input_names,
        output_names=output_names,
        session_init=SaverRestore(ckpt, ignore=ignore_names))
    predictor = OfflinePredictor(pred_config)

    # two types of input, dataflow or input_source
    if ds_val:
        gen = ds_val.get_data()
        ds_val.reset_state()
        input_sess = None
    else:
        if not insrc_val.setup_done():
            insrc_val.setup(model.get_inputs_desc())
        sess_config = get_default_sess_config()
        sess_config.device_count['GPU'] = 0
        input_tensors = insrc_val.get_input_tensors()
        sess_creater = tf.train.ChiefSessionCreator(config=sess_config)
        input_sess = tf.train.MonitoredSession(sess_creater)

        def _gen_func():
            insrc_val.reset_state()
            for _ in range(insrc_val.size()):
                yield input_sess.run(input_tensors)
        gen = _gen_func()

    for dp_idx, dp in enumerate(gen):
        output = predictor(*dp)
        batch_size = output[n_outputs - 1].shape[0]
        sample_counter.feed(batch_size)
        for o, handler in zip(output, stats_handlers):
            handler.feed(np.sum(o))
        if (args.debug_steps_per_epoch
                and dp_idx + 1 >= args.debug_steps_per_epoch):
            # stop early during debugging
            break

    eval_vals = []
    N = float(sample_counter.sum)
    for hi, handler in enumerate(stats_handlers):
        stat = handler.sum / float(N)
        logger.info('Stat {} has an avg of {}'.format(hi, stat))
        if hi < n_outputs:
            o_func = output_funcs[hi]
            if o_func is not None:
                stat = o_func(stat)
        if hi >= n_outputs - 1:
            # Note that again n_outputs - 1 is the eval val
            # followed by hallu stats.
            eval_vals.append(stat)
    if input_sess:
        input_sess.close()
    logger.info("evaluation_value={}".format(eval_vals))
    return eval_vals
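# --- Illustrative sketch (not part of the original source) -------------------
# eval_child() feeds the per-batch *sum* of each output into a StatCounter and
# divides by the total sample count at the end, i.e. it computes a
# sample-weighted mean that stays correct even when the last batch is smaller.
# The same computation with plain Python lists (values are made up):
batch_sizes = [64, 64, 32]
batch_error_sums = [5.0, 7.0, 2.0]       # sum of per-sample errors in each batch

mean_error = sum(batch_error_sums) / float(sum(batch_sizes))
assert abs(mean_error - 14.0 / 160.0) < 1e-12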
class ExpReplay(RNGDataFlow, Callback): """ Implement experience replay in the paper `Human-level control through deep reinforcement learning <http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html>`_. This implementation provides the interface as a :class:`DataFlow`. This DataFlow is __not__ fork-safe (thus doesn't support multiprocess prefetching). This implementation assumes that state is batch-able, and the network takes batched inputs. """ def __init__(self, predictor_io_names, predictor_refine_io_names, env, state_shape, batch_size, memory_size, init_memory_size, init_exploration, update_frequency): """ Args: predictor_io_names (tuple of list of str): input/output names to predict Q value from state. player (RLEnvironment): the player. history_len (int): length of history frames to concat. Zero-filled initial frames. update_frequency (int): number of new transitions to add to memory after sampling a batch of transitions for training. """ init_memory_size = int(init_memory_size) items = locals().items() for k, v in items: if k != 'self': setattr(self, k, v) self.exploration = init_exploration self.env = env self.rng = get_rng(self) # print('RNG------------------------------------------', self.rng.randint(10)) self._init_memory_flag = threading.Event( ) # tell if memory has been initialized # a queue to receive notifications to populate memory self._populate_job_queue = queue.Queue(maxsize=5) self.mem = ReplayMemory(memory_size, state_shape) self.mem_refine = ReplayMemoryRefine(memory_size, state_shape) self.env.reset() self._current_ob, self._current_history = self.env.focus_image, self.env.history # stage 1 ar actions self._action_space = self.env.action_space # stage 2 actions self._action_space_refine = self.env.action_space_refine logger.info( "Number of Legal actions: stage-1-ar {}, stage-2 {}".format( len(self._action_space), len(self._action_space_refine))) self._player_scores = StatCounter() self._current_game_score = StatCounter() self.state_shape = state_shape def get_simulator_thread(self): # spawn a separate thread to run policy def populate_job_func(): self._populate_job_queue.get() for _ in range(self.update_frequency): self._populate_exp() th = ShareSessionThread(LoopThread(populate_job_func, pausable=False)) th.name = "SimulatorThread" return th def _init_memory(self): logger.info("Populating replay memory with epsilon={} ...".format( self.exploration)) with get_tqdm(total=self.init_memory_size) as pbar: while len(self.mem) < self.init_memory_size: self._populate_exp() pbar.update() self._init_memory_flag.set() def _populate_exp(self): """ populate a transition by epsilon-greedy""" old_s, old_history = self._current_ob, self._current_history # forced termination if self.env.iou > 0.5: act = -1 else: if self.rng.rand() <= self.exploration: act = self.rng.choice(range(len(self._action_space))) else: q_values = self.predictor(old_s[None, ...], old_history.reshape(1, -1))[0][0] act = np.argmax(q_values) # stage 2 if self._action_space[act] != 'trigger': self.env.step(self._action_space[act]) refine_stop = False while not refine_stop: state_refine, history_refine = self.env.focus_image, self.env.history_refine if self.rng.rand() <= self.exploration: act_refine = self.rng.choice( range(len(self._action_space_refine))) else: q_values = self.predictor_refine( state_refine[None, ...], history_refine.reshape(1, -1))[0][0] act_refine = np.argmax(q_values) reward_refine, refine_stop = self.env.step_refine( self._action_space_refine[act_refine]) 
self.mem_refine.append( Experience(state_refine, act_refine, reward_refine, refine_stop, history_refine.reshape(-1))) reward, isOver = self.env.step_post() else: reward, isOver = self.env.step(self._action_space[act]) self._current_game_score.feed(reward) if isOver: # print('lord wins' if reward > 0 else 'farmer wins') # print(self._current_game_score.sum) self._player_scores.feed(self._current_game_score.sum) self.env.reset() self._current_game_score.reset() self._current_ob, self._current_history = self.env.focus_image, self.env.history self.mem.append( Experience(old_s, act, reward, isOver, old_history.reshape(-1))) def get_data(self): # wait for memory to be initialized self._init_memory_flag.wait() while True: idx = self.rng.randint(self._populate_job_queue.maxsize * self.update_frequency, len(self.mem) - 1, size=self.batch_size) batch_exp = [self.mem.sample(i) for i in idx] batch_exp_refine = [self.mem_refine.sample(i) for i in idx] yield self._process_batch(batch_exp) + self._process_batch( batch_exp_refine) self._populate_job_queue.put(1) def _process_batch(self, batch_exp): state = np.asarray([e[0] for e in batch_exp], dtype='float32') action = np.asarray([e[1] for e in batch_exp], dtype='int32') reward = np.asarray([e[2] for e in batch_exp], dtype='float32') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') history = np.asarray([e[4] for e in batch_exp], dtype='float32') return [state, action, reward, isOver, history] def _setup_graph(self): self.predictor = self.trainer.get_predictor(*self.predictor_io_names) self.predictor_refine = self.trainer.get_predictor( *self.predictor_refine_io_names) def _before_train(self): self._init_memory() self._simulator_th = self.get_simulator_thread() self._simulator_th.start() def _trigger(self): v = self._player_scores try: mean, max = v.average, v.max self.trainer.monitors.put_scalar('expreplay/mean_score', mean) self.trainer.monitors.put_scalar('expreplay/max_score', max) except Exception: logger.exception("Cannot log training scores.") v.reset()
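# --- Illustrative sketch (not part of the original source) -------------------
# The ReplayMemory / ReplayMemoryRefine classes used throughout these snippets
# are defined elsewhere in the code base.  A minimal ring-buffer replay memory
# with the same append/sample/len interface might look like the sketch below;
# the class itself is an assumption for illustration, not the original
# implementation.
class MinimalReplayMemory:
    def __init__(self, max_size):
        self.max_size = max_size
        self._buf = []
        self._next = 0

    def append(self, experience):
        if len(self._buf) < self.max_size:
            self._buf.append(experience)
        else:
            self._buf[self._next] = experience     # overwrite the oldest slot
        self._next = (self._next + 1) % self.max_size

    def sample(self, idx):
        return self._buf[idx]

    def __len__(self):
        return len(self._buf)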