class Evaluate(object):
  def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type,
                                                   flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type,
                                                         flags.env_name)
    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0,
                                      0.0,
                                      "/cpu:0",
                                      for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    self.episode_reward = 0

  def update(self, sess):
    self.process(sess)

  def is_done(self):
    return self.environment.is_all_scheduled_episodes_done()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def process(self, sess):
    last_action = self.environment.last_action
    last_reward = np.clip(self.environment.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward,
        self.environment.last_state)
    if not flags.use_pixel_change:
      pi_values, v_value = self.global_network.run_base_policy_and_value(
          sess, self.environment.last_state, last_action_reward)
    else:
      pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
          sess, self.environment.last_state, last_action_reward)
    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change = self.environment.process(action)
    self.episode_reward += reward
    if terminal:
      self.environment.reset()
      self.episode_reward = 0
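
# Hedged driver sketch for the Evaluate class above (not part of the original
# source): restore a trained checkpoint and step until every scheduled episode
# has finished. `flags.checkpoint_dir` is a hypothetical flag name.
def run_evaluation():
  evaluate = Evaluate()
  saver = tf.train.Saver()
  with tf.Session() as sess:
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
      saver.restore(sess, checkpoint.model_checkpoint_path)
    while not evaluate.is_done():
      evaluate.update(sess)
    print("episode reward: {}".format(evaluate.episode_reward))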
class Display(object):
  def __init__(self, display_size):
    pygame.init()
    self.surface = pygame.display.set_mode(display_size, 0, 24)
    pygame.display.set_caption('UNREAL')

    self.action_size = Environment.get_action_size(flags.env_type,
                                                   flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type,
                                                         flags.env_name)
    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0,
                                      0.0,
                                      "/cpu:0",
                                      for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    self.font = pygame.font.SysFont(None, 20)
    self.value_history = ValueHistory()
    self.state_history = StateHistory()
    self.episode_reward = 0

  def update(self, sess):
    self.surface.fill(BLACK)
    self.process(sess)
    pygame.display.update()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def scale_image(self, image, scale):
    return image.repeat(scale, axis=0).repeat(scale, axis=1)

  def draw_text(self, str, left, top, color=WHITE):
    text = self.font.render(str, True, color, BLACK)
    text_rect = text.get_rect()
    text_rect.left = left
    text_rect.top = top
    self.surface.blit(text, text_rect)

  def draw_center_text(self, str, center_x, top):
    text = self.font.render(str, True, WHITE, BLACK)
    text_rect = text.get_rect()
    text_rect.centerx = center_x
    text_rect.top = top
    self.surface.blit(text, text_rect)

  def show_pixel_change(self, pixel_change, left, top, rate, label):
    """Show pixel change."""
    pixel_change_ = np.clip(pixel_change * 255.0 * rate, 0.0, 255.0)
    data = pixel_change_.astype(np.uint8)
    data = np.stack([data for _ in range(3)], axis=2)
    data = self.scale_image(data, 4)
    image = pygame.image.frombuffer(data, (20 * 4, 20 * 4), 'RGB')
    self.surface.blit(image, (left + 8 + 4, top + 8 + 4))
    self.draw_center_text(label, left + 100 / 2, top + 100)

  def show_policy(self, pi):
    """Show action probability."""
    start_x = 10
    y = 150
    for i in range(len(pi)):
      width = pi[i] * 100
      pygame.draw.rect(self.surface, WHITE, (start_x, y, width, 10))
      y += 20
    self.draw_center_text("PI", 50, y)

  def show_image(self, state):
    """Show input image."""
    state_ = state * 255.0
    data = state_.astype(np.uint8)
    image = pygame.image.frombuffer(data, (84, 84), 'RGB')
    self.surface.blit(image, (8, 8))
    self.draw_center_text("input", 50, 100)

  def show_value(self):
    if self.value_history.is_empty:
      return

    min_v = float("inf")
    max_v = float("-inf")
    values = self.value_history.values
    for v in values:
      min_v = min(min_v, v)
      max_v = max(max_v, v)

    top = 150
    left = 150
    width = 100
    height = 100
    bottom = top + height  # was `top + width`; equal here, but clearer
    right = left + width   # was `left + height`
    d = max_v - min_v
    if d == 0:
      d = 1.0  # avoid division by zero while the history is flat
    last_r = 0.0
    for i, v in enumerate(values):
      r = (v - min_v) / d
      if i > 0:
        x0 = i - 1 + left
        x1 = i + left
        y0 = bottom - last_r * height
        y1 = bottom - r * height
        pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1)
      last_r = r

    pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1)
    pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1)
    pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1)
    pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom), 1)
    self.draw_center_text("V", left + width / 2, bottom + 10)

  def show_reward_prediction(self, rp_c, reward):
    start_x = 310
    reward_index = 0
    if reward == 0:
      reward_index = 0
    elif reward > 0:
      reward_index = 1
    elif reward < 0:
      reward_index = 2

    y = 150
    labels = ["0", "+", "-"]
    for i in range(len(rp_c)):
      width = rp_c[i] * 100
      color = RED if i == reward_index else WHITE
      pygame.draw.rect(self.surface, color, (start_x + 15, y, width, 10))
      self.draw_text(labels[i], start_x, y - 1, color)
      y += 20
    self.draw_center_text("RP", start_x + 100 / 2, y)

  def show_reward(self):
    self.draw_text("REWARD: {}".format(int(self.episode_reward)), 310, 10)

  def process(self, sess):
    last_action = self.environment.last_action
    last_reward = np.clip(self.environment.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward,
        self.environment.last_state)
    if not flags.use_pixel_change:
      pi_values, v_value = self.global_network.run_base_policy_and_value(
          sess, self.environment.last_state, last_action_reward)
    else:
      pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
          sess, self.environment.last_state, last_action_reward)
    self.value_history.add_value(v_value)

    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change = self.environment.process(action)
    self.episode_reward += reward
    if terminal:
      self.environment.reset()
      self.episode_reward = 0

    self.show_image(state['image'])
    self.show_policy(pi_values)
    self.show_value()
    self.show_reward()
    if flags.use_pixel_change:
      self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
      self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")
    if flags.use_reward_prediction:
      if self.state_history.is_full:
        rp_c = self.global_network.run_rp_c(sess, self.state_history.states)
        self.show_reward_prediction(rp_c, reward)
    self.state_history.add_state(state)

  def get_frame(self):
    data = self.surface.get_buffer().raw
    return data
class Display(object):
  def __init__(self, display_size):
    pygame.init()
    self.surface = pygame.display.set_mode(display_size, 0, 24)
    name = 'UNREAL' if flags.segnet == 0 else "A3C ErfNet"
    pygame.display.set_caption(name)

    env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config.get('height', 88),
                        env_config.get('width', 88)]
    segnet_param_dict = {'segnet_mode': flags.segnet}
    is_training = tf.placeholder(tf.bool, name="training")

    map_file = env_config.get('objecttypes_file', '../../objectTypes.csv')
    self.label_mapping = pd.read_csv(map_file, sep=',', header=0)
    self.get_col_index()

    self.action_size = Environment.get_action_size(flags.env_type,
                                                   flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type,
                                                         flags.env_name)
    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0,
                                      0.0,
                                      "/gpu:0",
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout,
                                      for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name, flags.termination_time_sec,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    self.font = pygame.font.SysFont(None, 20)
    self.value_history = ValueHistory()
    self.state_history = StateHistory()
    self.episode_reward = 0

  def update(self, sess):
    self.surface.fill(BLACK)
    self.process(sess)
    pygame.display.update()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def scale_image(self, image, scale):
    return image.repeat(scale, axis=0).repeat(scale, axis=1)

  def draw_text(self, str, left, top, color=WHITE):
    text = self.font.render(str, True, color, BLACK)
    text_rect = text.get_rect()
    text_rect.left = left
    text_rect.top = top
    self.surface.blit(text, text_rect)

  def draw_center_text(self, str, center_x, top):
    text = self.font.render(str, True, WHITE, BLACK)
    text_rect = text.get_rect()
    text_rect.centerx = center_x
    text_rect.top = top
    self.surface.blit(text, text_rect)

  def show_pixel_change(self, pixel_change, left, top, rate, label):
    """Show a pixel-change map, or an RGB segmentation image when the
    label is not a PC map."""
    if "PC" in label:
      pixel_change_ = np.clip(pixel_change * 255.0 * rate, 0.0, 255.0)
      data = pixel_change_.astype(np.uint8)
      data = np.stack([data for _ in range(3)], axis=2)
      data = self.scale_image(data, 4)
      #print("PC shape", data.shape)
      image = pygame.image.frombuffer(data, (20 * 4, 20 * 4), 'RGB')
    else:
      pixel_change = self.scale_image(pixel_change, 2)
      #print("Preds shape", pixel_change.shape)
      image = pygame.image.frombuffer(
          pixel_change.astype(np.uint8),
          (self.image_shape[0] * 2, self.image_shape[1] * 2), 'RGB')
    self.surface.blit(image, (2 * left + 16 + 8, 2 * top + 16 + 8))
    self.draw_center_text(label, 2 * left + 200 / 2, 2 * top + 200)

  def show_policy(self, pi):
    """Show action probability."""
    start_x = 10
    y = 150
    for i in range(len(pi)):
      width = pi[i] * 100
      pygame.draw.rect(self.surface, WHITE,
                       (2 * start_x, 2 * y, 2 * width, 2 * 10))
      y += 20
    self.draw_center_text("PI", 2 * 50, 2 * y)

  def show_image(self, state):
    """Show input image."""
    state_ = state * 255.0
    data = state_.astype(np.uint8)
    data = self.scale_image(data, 2)
    image = pygame.image.frombuffer(
        data, (self.image_shape[0] * 2, self.image_shape[1] * 2), 'RGB')
    self.surface.blit(image, (8 * 2, 8 * 2))
    self.draw_center_text("input", 2 * 50, 2 * 100)

  def show_value(self):
    if self.value_history.is_empty:
      return

    min_v = float("inf")
    max_v = float("-inf")
    values = self.value_history.values
    for v in values:
      min_v = min(min_v, v)
      max_v = max(max_v, v)

    top = 150 * 2
    left = 150 * 2
    width = 100 * 2
    height = 100 * 2
    bottom = top + height  # was `top + width`; equal here, but clearer
    right = left + width   # was `left + height`
    d = max_v - min_v
    if d == 0:
      d = 1.0  # avoid division by zero while the history is flat
    last_r = 0.0
    for i, v in enumerate(values):
      r = (v - min_v) / d
      if i > 0:
        x0 = i - 1 + left
        x1 = i + left
        y0 = bottom - last_r * height
        y1 = bottom - r * height
        pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1)
      last_r = r

    pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1)
    pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1)
    pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1)
    pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom), 1)
    self.draw_center_text("V", left + width / 2, bottom + 10)

  def show_reward_prediction(self, rp_c, reward):
    start_x = 310
    reward_index = 0
    if reward == 0:
      reward_index = 0
    elif reward > 0:
      reward_index = 1
    elif reward < 0:
      reward_index = 2

    y = 150
    labels = ["0", "+", "-"]
    for i in range(len(rp_c)):
      width = rp_c[i] * 100
      color = RED if i == reward_index else WHITE
      pygame.draw.rect(self.surface, color,
                       (2 * start_x + 2 * 15, 2 * y, 2 * width, 2 * 10))
      self.draw_text(labels[i], 2 * start_x, 2 * y - 2 * 1, color)
      y += 20
    # the original passed `y` here; `2 * y` matches the doubled layout
    self.draw_center_text("RP", 2 * start_x + 2 * 100 / 2, 2 * y)

  def show_reward(self):
    self.draw_text("REWARD: {:.4}".format(float(self.episode_reward)),
                   300, 2 * 10)

  def process(self, sess):
    # NOTE: running the initializers on every frame re-randomizes the
    # network weights; if a checkpoint is restored elsewhere, this call
    # likely belongs in __init__ (kept here as in the original).
    sess.run([tf.global_variables_initializer(),
              tf.local_variables_initializer()])
    #sess.run(tf.initialize_all_variables())

    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward,
        self.environment.last_state)

    preds = None
    mode = "segnet" if flags.segnet >= 2 else ""
    mode = ""  # don't want preds
    if not flags.use_pixel_change:
      pi_values, v_value, preds = self.global_network.run_base_policy_and_value(
          sess, self.environment.last_state, last_action_reward, mode=mode)
    else:
      pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
          sess, self.environment.last_state, last_action_reward)
    #print(preds)
    self.value_history.add_value(v_value)

    prev_state = self.environment.last_state

    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change = self.environment.process(action)
    self.episode_reward += reward
    if terminal:
      self.environment.reset()
      self.episode_reward = 0

    self.show_image(state['image'])
    self.show_policy(pi_values)
    self.show_value()
    self.show_reward()

    if not flags.use_pixel_change:
      if preds is not None:
        self.show_pixel_change(self.label_to_rgb(preds), 100, 0, 3.0, "Preds")
        self.show_pixel_change(self.label_to_rgb(state['objectType']),
                               200, 0, 0.4, "Segm Mask")
    else:
      self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
      self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")
    if flags.use_reward_prediction:
      if self.state_history.is_full:
        rp_c = self.global_network.run_rp_c(sess, self.state_history.states)
        self.show_reward_prediction(rp_c, reward)
    self.state_history.add_state(state)

  def get_frame(self):
    data = self.surface.get_buffer().raw
    return data

  def get_col_index(self):
    ind_col = self.label_mapping[["index", "color"]].values
    index = ind_col[:, 0].astype(int)  # `np.int` is deprecated in NumPy
    self.index, ind = np.unique(index, return_index=True)
    self.col = np.array([[int(x) for x in col.split('_')]
                         for col in ind_col[ind, 1]])

  def label_to_rgb(self, labels):
    #print(self.col)
    rgb_img = self.col[np.where(
        self.index[np.newaxis, :] == labels.ravel()[:, np.newaxis]
    )[1]].reshape(labels.shape + (3,))
    return rgb_img
class Tester(object):
  def __init__(self):
    self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
    self.action_size = Environment.get_action_size()
    self.global_network = UnrealModel(self.action_size, -1, "/cpu:0",
                                      for_display=True)
    self.env = Environment.create_environment()
    self.value_history = ValueHistory()
    self.state_history = StateHistory()
    self.ep_reward = 0
    self.mazemap = MazeMap()

  def process(self, sess):
    self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
    last_action = self.env.last_action
    last_reward = np.clip(self.env.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)
    if not USE_PIXEL_CHANGE:
      pi_values, v_value = self.global_network.run_base_policy_and_value(
          sess, self.env.last_state, last_action_reward)
    else:
      pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
          sess, self.env.last_state, last_action_reward)
    self.value_history.add_value(v_value)

    action = self.choose_action(pi_values)
    state, reward, terminal, pc, vtrans, vrot = self.env.process(action)
    self.state_history.add_state(state)
    self.ep_reward += reward
    self.mazemap.update(vtrans, vrot)

    if reward > 9:
      # Reaching the maze goal gives a reward of 10, so the map must be reset.
      self.mazemap.reset()
    if terminal:
      # By default the lab environment ends an episode after 3600 frames
      # rather than emitting a terminal signal at the maze goal.
      self.env.reset()
      self.ep_reward = 0
      self.mazemap.reset()

    self.show_ob(state, 3, 3, "Observation")
    self.show_pc(pc, 100, 3, 3.0, "Pixel Change")
    if USE_PIXEL_CHANGE:
      # pc_q is only computed above when USE_PIXEL_CHANGE is set; the
      # original called this unconditionally.
      self.show_pc(pc_q[:, :, action], 200, 3, 0.4, "PC Q")
    self.show_map(300, 3, "Maze Map")
    self.show_pi(pi_values)
    self.show_reward()
    self.show_rp()
    self.show_value()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def scale_image(self, image, scale):
    return image.repeat(scale, axis=0).repeat(scale, axis=1)

  def draw_text(self, text, left, bottom, color=WHITE):
    font = cv2.FONT_HERSHEY_COMPLEX
    cv2.putText(self.img, text, (left, bottom), font, 0.35, color)

  def show_pc(self, pc, left, top, rate, label):
    pc = np.clip(pc * 255.0 * rate, 0.0, 255.0)
    data = pc.astype(np.uint8)
    data = np.stack([data for _ in range(3)], axis=2)
    data = self.scale_image(data, 4)
    h = data.shape[0]
    w = data.shape[1]
    self.img[top:top + h, left:left + w, :] = data
    self.draw_text(label, (left + 2), (top + h + 15))

  def show_map(self, left, top, label):
    maze = self.mazemap.get_map(84, 84)
    maze = (maze * 255).astype(np.uint8)
    h = maze.shape[0]
    w = maze.shape[1]
    self.img[top:top + h, left:left + w, :] = maze
    self.draw_text(label, (left + 2), (top + h + 5))

  def show_pi(self, pi):
    for i in range(len(pi)):
      width = int(pi[i] * 100)
      cv2.rectangle(self.img, (3, 113 + 15 * i), (width, 120 + 15 * i), WHITE)
    self.draw_text("Policy", 20, 120 + 15 * len(pi))

  def show_ob(self, state, left, top, label):
    state = (state * 255.0).astype(np.uint8)
    h = state.shape[0]
    w = state.shape[1]
    self.img[top:top + h, left:left + w, :] = state
    self.draw_text(label, (left + 2), (top + h + 15))

  def show_value(self, left=120, top=115, height=100, width=100):
    # Default geometry is an assumption added so the no-argument call in
    # process() works; the original signature had no defaults.
    if self.value_history.is_empty:
      return

    min_v = float("inf")
    max_v = float("-inf")
    values = self.value_history.values
    for v in values:
      min_v = min(min_v, v)
      max_v = max(max_v, v)

    bottom = top + height
    right = left + width
    d = max_v - min_v
    if d == 0:
      d = 1.0  # avoid division by zero while the history is flat
    last_r = 0.0
    for i, v in enumerate(values):
      r = (v - min_v) / d
      if i > 0:
        x0 = i - 1 + left
        x1 = i + left
        y0 = bottom - last_r * height
        y1 = bottom - r * height
        cv2.line(self.img, (int(y0), int(x0)), (int(y1), int(x1)), BLUE, 2)
      last_r = r

    cv2.line(self.img, (top, left), (bottom, left), WHITE, 1)
    cv2.line(self.img, (top, right), (bottom, right), WHITE, 1)
    cv2.line(self.img, (top, left), (top, right), WHITE, 1)
    cv2.line(self.img, (bottom, left), (bottom, right), WHITE, 1)
    self.draw_text("Q Value", 120, 215)

  def show_rp(self):
    pass

  def show_reward(self):
    self.draw_text("Reward: {}".format(int(self.ep_reward)), 10, 230)

  def get_frame(self):
    return self.img
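
# Hedged sketch of a viewer loop for Tester (not in the original source):
# cv2.imshow expects BGR, so the RGB frame is converted before display.
def run_tester():
  tester = Tester()
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # or restore a checkpoint
    while True:
      tester.process(sess)
      frame = cv2.cvtColor(tester.get_frame(), cv2.COLOR_RGB2BGR)
      cv2.imshow("UNREAL tester", frame)
      if cv2.waitKey(16) & 0xFF == ord('q'):  # ~60 FPS; quit on 'q'
        break
  cv2.destroyAllWindows()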
class Evaluate(object):
  def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type,
                                                   flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type,
                                                         flags.env_name)

    env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config['height'], env_config['width']]
    segnet_param_dict = {'segnet_mode': flags.segnet}
    is_training = tf.placeholder(tf.bool, name="training")

    # for_display=True tells UnrealModel it is being built for evaluation.
    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0,  # flags.pixel_change_lambda
                                      0.0,  # flags.entropy_beta
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout,
                                      for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name, flags.termination_time_sec,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    self.global_network.prepare_loss()

    self.total_loss = []
    self.segm_loss = []
    self.episode_reward = [0]
    self.episode_roomtype = []
    self.roomType_dict = {}
    self.segnet_class_dict = {}
    self.success_rate = []
    self.batch_size = 20
    self.batch_cur_num = 0
    self.batch_prev_num = 0
    self.batch_si = []
    self.batch_sobjT = []
    self.batch_a = []
    self.batch_reward = []

  def update(self, sess):
    self.process(sess)

  def is_done(self):
    return self.environment.is_all_scheduled_episodes_done()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def process(self, sess):
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward,
        self.environment.last_state)

    if random_policy:
      pi_values = [1 / 3.0, 1 / 3.0, 1 / 3.0]
      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      self.episode_reward[-1] += reward
    else:
      mode = "segnet" if flags.segnet >= 2 else ""
      segnet_preds = None
      if not flags.use_pixel_change:
        pi_values, v_value, segnet_preds = self.global_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward, mode=mode)
      else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
            sess, self.environment.last_state, last_action_reward)

      if segnet_preds is not None:
        mask = self.environment.last_state.get('objectType', None)
        if mask is not None:
          new_classes = np.unique(mask)
          if segnet_preds.shape != mask.shape:
            print("Predictions have shape {}, but groundtruth mask has shape {}".format(
                segnet_preds.shape, mask.shape))
          else:
            similar = segnet_preds == mask
            for id_class in new_classes:
              id_list = self.segnet_class_dict.get(id_class, None)
              if id_list is None:
                id_list = []
              # store [correctly predicted pixels, total pixels] per class
              id_list += [[np.sum(similar[mask == id_class]),
                           np.sum(mask == id_class)]]
              self.segnet_class_dict[id_class] = id_list

      self.batch_cur_num += 1
      if flags.segnet == -1:  # just not necessary
        if (self.batch_cur_num != 0 and
            self.batch_cur_num - self.batch_prev_num >= self.batch_size):
          #print(np.unique(self.batch_sobjT))
          feed_dict = {
              self.global_network.base_input: self.batch_si,
              self.global_network.base_segm_mask: self.batch_sobjT,
              self.global_network.is_training: False
          }
          segm_loss, preds, confusion_mtx = sess.run(
              [self.global_network.decoder_loss,
               self.global_network.preds,
               self.global_network.update_evaluation_vars],
              feed_dict=feed_dict)
          total_loss = 0
          self.total_loss += [total_loss]
          self.segm_loss += [segm_loss]
          # TODO: here do something with it, store somewhere?

          # update everything else
          self.batch_prev_num = self.batch_cur_num
          self.batch_si = []
          self.batch_sobjT = []
          self.batch_a = []
        else:
          self.batch_si += [self.environment.last_state["image"]]
          self.batch_sobjT += [self.environment.last_state["objectType"]]
          self.batch_a += [self.environment.ACTION_LIST[self.environment.last_action]]

      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      self.episode_reward[-1] += reward

    if terminal:
      ep_info = self.environment._episode_info
      if ep_info['task'] == 'room_goal':
        one_hot_room = ep_info['goal']['roomTypeEncoded']
        room_type = ep_info['goal']['roomType']
        ind = np.where(one_hot_room)[0][0]
        self.roomType_dict[ind] = room_type
        self.episode_roomtype += [ind]
      self.success_rate += [int(self.environment._last_full_state["success"])]
      self.environment.reset()
      self.episode_reward += [0]
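
# Hedged sketch (not part of the original source): summarize the per-class
# pixel counts accumulated in segnet_class_dict into per-class accuracies.
# Each dict entry holds [correct_pixels, total_pixels] pairs, one per frame.
def per_class_accuracy(segnet_class_dict):
  accuracies = {}
  for id_class, pairs in segnet_class_dict.items():
    correct = sum(p[0] for p in pairs)
    total = sum(p[1] for p in pairs)
    accuracies[id_class] = correct / float(total) if total else 0.0
  return accuracies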