def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name, use_lstm,
             use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc,
             experience_history_size, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_lstm = use_lstm
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.objective_size = Environment.get_objective_size(env_type, env_name)
    self.local_network = UnrealModel(self.action_size, self.objective_size,
                                     thread_index, use_lstm, use_pixel_change,
                                     use_value_replay, use_reward_prediction,
                                     pixel_change_lambda, entropy_beta, device)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss,
        global_network.get_vars(),
        self.local_network.get_vars())

    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
    self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                      flags.use_lstm, flags.use_pixel_change,
                                      flags.use_value_replay, flags.use_reward_prediction,
                                      0.0, 0.0, "/cpu:0", for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'seed': flags.seed,
                  # 'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    self.episode_reward = 0
    self.cnt_success = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size()
    self.local_network = UnrealModel(self.action_size, thread_index, device)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss,
        global_network.get_vars(),
        self.local_network.get_vars())

    self.sync = self.local_network.sync_from(global_network)
    self.environment = Environment.create_environment()
    self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
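# Note: the trainer constructors above store initial_learning_rate and
# max_global_time_step but do not show how the learning rate is decayed.
# A minimal sketch of the linear annealing used by the full Trainer class
# later in this section (_anneal_learning_rate); the standalone function
# form and the example numbers are illustrative only.
def anneal_learning_rate(initial_learning_rate, global_time_step, max_global_time_step):
    # Decay linearly from the initial value down to zero over training.
    learning_rate = initial_learning_rate * (
        max_global_time_step - global_time_step) / max_global_time_step
    return max(learning_rate, 0.0)

# Example: anneal_learning_rate(7e-4, 5 * 10**6, 10 * 10**6) -> 3.5e-4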
def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
    print('flags:use_pixel_change {}'.format(flags.use_pixel_change))
    sleep(10)
    self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                      flags.use_lstm, flags.use_pixel_change,
                                      flags.use_value_replay, flags.use_reward_prediction,
                                      0.0, 0.0, "/cpu:0", for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    print('\n======\nENV in Evaluate::ctor')
    print(self.environment)
    print(self.global_network)
    print('val_replay!!! {}'.format(flags.use_value_replay))
    print(flags.split)
    print('=======\n')
    sleep(10)
    self.episode_reward = 0
def __init__(self, display_size):
    pygame.init()
    self.surface = pygame.display.set_mode(display_size, 0, 24)
    pygame.display.set_caption('UNREAL')

    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
    self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                      flags.use_lstm, flags.use_pixel_change,
                                      flags.use_value_replay, flags.use_reward_prediction,
                                      0.0, 0.0, "/cpu:0", for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    self.font = pygame.font.SysFont(None, 20)
    self.value_history = ValueHistory()
    self.state_history = StateHistory()
    self.episode_reward = 0
class Evaluate(object):
    def __init__(self):
        self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
        self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                          flags.use_lstm, flags.use_pixel_change,
                                          flags.use_value_replay, flags.use_reward_prediction,
                                          0.0, 0.0, "/cpu:0", for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type, flags.env_name,
            env_args={'episode_schedule': flags.split,
                      'log_action_trace': flags.log_action_trace,
                      'max_states_per_scene': flags.episodes_per_scene,
                      'episodes_per_scene_test': flags.episodes_per_scene})
        self.episode_reward = 0

    def update(self, sess):
        self.process(sess)

    def is_done(self):
        return self.environment.is_all_scheduled_episodes_done()

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def process(self, sess):
        last_action = self.environment.last_action
        last_reward = np.clip(self.environment.last_reward, -1, 1)
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward, self.environment.last_state)

        if not flags.use_pixel_change:
            pi_values, v_value = self.global_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.environment.last_state, last_action_reward)

        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = self.environment.process(action)
        self.episode_reward += reward

        if terminal:
            self.environment.reset()
            self.episode_reward = 0
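# ExperienceFrame.concat_action_and_reward is called throughout this section
# but never defined here. The sketch below shows the layout it is assumed to
# produce: a one-hot previous action with the (clipped) previous reward
# appended, optionally followed by an objective vector taken from the state
# dict in the navigation variants. The field order and the 'objective' key
# are assumptions, not taken from the original code.
import numpy as np

def concat_action_and_reward_sketch(action, action_size, reward, state=None):
    vec = np.zeros(action_size + 1, dtype=np.float32)
    vec[action] = 1.0          # one-hot previous action
    vec[-1] = float(reward)    # clipped previous reward
    if state is not None and 'objective' in state:
        vec = np.concatenate([vec, np.asarray(state['objective'], dtype=np.float32)])
    return vec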
def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)

    env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config['height'], env_config['width']]
    segnet_param_dict = {'segnet_mode': flags.segnet}
    is_training = tf.placeholder(tf.bool, name="training")

    # for display param in UnrealModel says its value
    self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                      flags.use_lstm, flags.use_pixel_change,
                                      flags.use_value_replay, flags.use_reward_prediction,
                                      0.0,  # flags.pixel_change_lambda
                                      0.0,  # flags.entropy_beta
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout,
                                      for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type, flags.env_name, flags.termination_time_sec,
        env_args={'episode_schedule': flags.split,
                  'log_action_trace': flags.log_action_trace,
                  'max_states_per_scene': flags.episodes_per_scene,
                  'episodes_per_scene_test': flags.episodes_per_scene})
    self.global_network.prepare_loss()

    self.total_loss = []
    self.segm_loss = []
    self.episode_reward = [0]
    self.episode_roomtype = []
    self.roomType_dict = {}
    self.segnet_class_dict = {}
    self.success_rate = []

    self.batch_size = 20
    self.batch_cur_num = 0
    self.batch_prev_num = 0
    self.batch_si = []
    self.batch_sobjT = []
    self.batch_a = []
    self.batch_reward = []
def main(args):
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
    global_network = UnrealModel(action_size, objective_size, -1,
                                 flags.use_lstm, flags.use_pixel_change,
                                 flags.use_value_replay, flags.use_reward_prediction,
                                 0.0, 0.0,
                                 "/cpu:0")  # use CPU for weight visualize tool

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    vars = {}
    var_list = global_network.get_vars()
    for v in var_list:
        vars[v.name] = v

    W_conv1 = sess.run(vars['net_-1/base_conv/W_base_conv1:0'])

    # show graph of W_conv1
    fig, axes = plt.subplots(3, 16, figsize=(12, 6),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(3 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()
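# On a headless machine plt.show() has no display to attach to. An optional
# tweak (standard matplotlib API, not part of the original tool) is to select
# the non-interactive Agg backend before pyplot is imported and write the
# filter grid to disk instead:
#
#     import matplotlib
#     matplotlib.use('Agg')
#     import matplotlib.pyplot as plt
#     ...  # build `fig` exactly as in main() above, then:
#     fig.savefig('conv1_filters.png', dpi=150, bbox_inches='tight')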
def __init__(self, display_size):
    pygame.init()
    self.surface = pygame.display.set_mode(display_size, 0, 24)
    pygame.display.set_caption('UNREAL')

    self.action_size = Environment.get_action_size()
    self.global_network = UnrealModel(self.action_size, -1, "/cpu:0", for_display=True)
    self.environment = Environment.create_environment()
    self.font = pygame.font.SysFont(None, 20)
    self.value_history = ValueHistory()
    self.state_history = StateHistory()
    self.episode_reward = 0
def get_prediction(history, action, env_name, check_dir):
    action_size = Environment.get_action_size(env_type, env_name)
    global_network = UnrealModel(action_size,
                                 -1,
                                 # flags.use_pixel_change,
                                 # flags.use_value_replay,
                                 # flags.use_reward_prediction,
                                 # flags.use_future_reward_prediction,
                                 # flags.use_autoencoder,
                                 False, False, False, True, True,
                                 .0, .0,
                                 "/cpu:0")

    config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(check_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    feed_dict = {
        global_network.frp_input: np.zeros((4, 84, 84, 3)),
        global_network.frp_action_input: np.zeros((1, action_size))
    }
    encoder_output = sess.run(global_network.encoder_output, feed_dict)
    print(encoder_output)
class Display(object):
    def __init__(self, display_size):
        pygame.init()
        self.surface = pygame.display.set_mode(display_size, 0, 24)
        name = 'UNREAL' if flags.segnet == 0 else "A3C ErfNet"
        pygame.display.set_caption(name)

        env_config = sim_config.get(flags.env_name)
        self.image_shape = [env_config.get('height', 88),
                            env_config.get('width', 88)]
        segnet_param_dict = {'segnet_mode': flags.segnet}
        is_training = tf.placeholder(tf.bool, name="training")

        map_file = env_config.get('objecttypes_file', '../../objectTypes.csv')
        self.label_mapping = pd.read_csv(map_file, sep=',', header=0)
        self.get_col_index()

        self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
        self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                          flags.use_lstm, flags.use_pixel_change,
                                          flags.use_value_replay, flags.use_reward_prediction,
                                          0.0, 0.0, "/gpu:0",
                                          segnet_param_dict=segnet_param_dict,
                                          image_shape=self.image_shape,
                                          is_training=is_training,
                                          n_classes=flags.n_classes,
                                          segnet_lambda=flags.segnet_lambda,
                                          dropout=flags.dropout,
                                          for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type, flags.env_name, flags.termination_time_sec,
            env_args={'episode_schedule': flags.split,
                      'log_action_trace': flags.log_action_trace,
                      'max_states_per_scene': flags.episodes_per_scene,
                      'episodes_per_scene_test': flags.episodes_per_scene})
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0

    def update(self, sess):
        self.surface.fill(BLACK)
        self.process(sess)
        pygame.display.update()

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def scale_image(self, image, scale):
        return image.repeat(scale, axis=0).repeat(scale, axis=1)

    def draw_text(self, str, left, top, color=WHITE):
        text = self.font.render(str, True, color, BLACK)
        text_rect = text.get_rect()
        text_rect.left = left
        text_rect.top = top
        self.surface.blit(text, text_rect)

    def draw_center_text(self, str, center_x, top):
        text = self.font.render(str, True, WHITE, BLACK)
        text_rect = text.get_rect()
        text_rect.centerx = center_x
        text_rect.top = top
        self.surface.blit(text, text_rect)

    def show_pixel_change(self, pixel_change, left, top, rate, label):
        """ Show pixel change """
        if "PC" in label:
            pixel_change_ = np.clip(pixel_change * 255.0 * rate, 0.0, 255.0)
            data = pixel_change_.astype(np.uint8)
            data = np.stack([data for _ in range(3)], axis=2)
            data = self.scale_image(data, 4)
            # print("PC shape", data.shape)
            image = pygame.image.frombuffer(data, (20 * 4, 20 * 4), 'RGB')
        else:
            pixel_change = self.scale_image(pixel_change, 2)
            # print("Preds shape", pixel_change.shape)
            image = pygame.image.frombuffer(
                pixel_change.astype(np.uint8),
                (self.image_shape[0] * 2, self.image_shape[1] * 2), 'RGB')
        self.surface.blit(image, (2 * left + 16 + 8, 2 * top + 16 + 8))
        self.draw_center_text(label, 2 * left + 200 / 2, 2 * top + 200)

    def show_policy(self, pi):
        """ Show action probability. """
        start_x = 10
        y = 150
        for i in range(len(pi)):
            width = pi[i] * 100
            pygame.draw.rect(self.surface, WHITE,
                             (2 * start_x, 2 * y, 2 * width, 2 * 10))
            y += 20
        self.draw_center_text("PI", 2 * 50, 2 * y)

    def show_image(self, state):
        """ Show input image """
        state_ = state * 255.0
        data = state_.astype(np.uint8)
        data = self.scale_image(data, 2)
        image = pygame.image.frombuffer(
            data, (self.image_shape[0] * 2, self.image_shape[1] * 2), 'RGB')
        self.surface.blit(image, (8 * 2, 8 * 2))
        self.draw_center_text("input", 2 * 50, 2 * 100)

    def show_value(self):
        if self.value_history.is_empty:
            return

        min_v = float("inf")
        max_v = float("-inf")
        values = self.value_history.values
        for v in values:
            min_v = min(min_v, v)
            max_v = max(max_v, v)

        top = 150 * 2
        left = 150 * 2
        width = 100 * 2
        height = 100 * 2
        bottom = top + width
        right = left + height

        d = max_v - min_v
        last_r = 0.0
        for i, v in enumerate(values):
            r = (v - min_v) / d
            if i > 0:
                x0 = i - 1 + left
                x1 = i + left
                y0 = bottom - last_r * height
                y1 = bottom - r * height
                pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1)
            last_r = r

        pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1)
        pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1)
        pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1)
        pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom), 1)
        self.draw_center_text("V", left + width / 2, bottom + 10)

    def show_reward_prediction(self, rp_c, reward):
        start_x = 310
        reward_index = 0
        if reward == 0:
            reward_index = 0
        elif reward > 0:
            reward_index = 1
        elif reward < 0:
            reward_index = 2

        y = 150
        labels = ["0", "+", "-"]
        for i in range(len(rp_c)):
            width = rp_c[i] * 100
            if i == reward_index:
                color = RED
            else:
                color = WHITE
            pygame.draw.rect(self.surface, color,
                             (2 * start_x + 2 * 15, 2 * y, 2 * width, 2 * 10))
            self.draw_text(labels[i], 2 * start_x, 2 * y - 2 * 1, color)
            y += 20
        self.draw_center_text("RP", 2 * start_x + 2 * 100 / 2, y)

    def show_reward(self):
        self.draw_text("REWARD: {:.4}".format(float(self.episode_reward)), 300, 2 * 10)

    def process(self, sess):
        sess.run([tf.global_variables_initializer(),
                  tf.local_variables_initializer()])
        # sess.run(tf.initialize_all_variables())
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward, self.environment.last_state)

        preds = None
        mode = "segnet" if flags.segnet >= 2 else ""
        mode = ""  # don't want preds
        if not flags.use_pixel_change:
            pi_values, v_value, preds = self.global_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward, mode=mode)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.environment.last_state, last_action_reward)
        # print(preds)

        self.value_history.add_value(v_value)

        prev_state = self.environment.last_state
        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = self.environment.process(action)
        self.episode_reward += reward

        if terminal:
            self.environment.reset()
            self.episode_reward = 0

        self.show_image(state['image'])
        self.show_policy(pi_values)
        self.show_value()
        self.show_reward()

        if not flags.use_pixel_change:
            if preds is not None:
                self.show_pixel_change(self.label_to_rgb(preds), 100, 0, 3.0, "Preds")
                self.show_pixel_change(self.label_to_rgb(state['objectType']),
                                       200, 0, 0.4, "Segm Mask")
        else:
            self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
            self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

        if flags.use_reward_prediction:
            if self.state_history.is_full:
                rp_c = self.global_network.run_rp_c(sess, self.state_history.states)
                self.show_reward_prediction(rp_c, reward)

        self.state_history.add_state(state)

    def get_frame(self):
        data = self.surface.get_buffer().raw
        return data

    def get_col_index(self):
        ind_col = self.label_mapping[["index", "color"]].values
        index = ind_col[:, 0].astype(np.int)
        self.index, ind = np.unique(index, return_index=True)
        self.col = np.array([[int(x) for x in col.split('_')]
                             for col in ind_col[ind, 1]])

    def label_to_rgb(self, labels):
        # print(self.col)
        rgb_img = self.col[np.where(
            self.index[np.newaxis, :] == labels.ravel()[:, np.newaxis])[1]].reshape(
                labels.shape + (3, ))
        return rgb_img
class Trainer(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, env_type, env_name, use_lstm, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, local_t_max, n_step_TD, gamma, gamma_pc, experience_history_size, max_global_time_step, device, segnet_param_dict, image_shape, is_training, n_classes, random_state, termination_time, segnet_lambda, dropout): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.env_type = env_type self.env_name = env_name self.use_lstm = use_lstm self.use_pixel_change = use_pixel_change self.use_value_replay = use_value_replay self.use_reward_prediction = use_reward_prediction self.local_t_max = local_t_max self.n_step_TD = n_step_TD self.gamma = gamma self.gamma_pc = gamma_pc self.experience_history_size = experience_history_size self.max_global_time_step = max_global_time_step self.action_size = Environment.get_action_size(env_type, env_name) self.objective_size = Environment.get_objective_size( env_type, env_name) self.segnet_param_dict = segnet_param_dict self.segnet_mode = self.segnet_param_dict.get("segnet_mode", None) self.is_training = is_training self.n_classes = n_classes self.segnet_lambda = segnet_lambda self.run_metadata = tf.RunMetadata() self.many_runs_timeline = TimeLiner() self.random_state = random_state self.termination_time = termination_time self.dropout = dropout try: self.local_network = UnrealModel( self.action_size, self.objective_size, thread_index, use_lstm, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, device, segnet_param_dict=self.segnet_param_dict, image_shape=image_shape, is_training=is_training, n_classes=n_classes, segnet_lambda=self.segnet_lambda, dropout=dropout) self.local_network.prepare_loss() self.apply_gradients = grad_applier.minimize_local( self.local_network.total_loss, global_network.get_vars(), self.local_network.get_vars(), self.thread_index) self.sync = self.local_network.sync_from(global_network) self.experience = Experience(self.experience_history_size, random_state=self.random_state) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # For log output self.prev_local_t = -1 self.prev_local_t_loss = 0 self.sr_size = 50 self.success_rates = deque(maxlen=self.sr_size) except Exception as e: print(str(e)) #, flush=True) raise Exception( "Problem in Trainer {} initialization".format(thread_index)) def prepare(self, termination_time=50.0, termination_dist_value=-10.0): self.environment = Environment.create_environment( self.env_type, self.env_name, self.termination_time, thread_index=self.thread_index) def stop(self): self.environment.stop() def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return self.random_state.choice(len(pi_values), p=pi_values) def _record_one(self, sess, summary_writer, summary_op, score_input, score, global_t): if self.thread_index >= 0: summary_str = sess.run(summary_op, feed_dict={score_input: score}) for sum_wr in summary_writer: sum_wr.add_summary(summary_str, global_t) def _record_all(self, sess, summary_writer, summary_op, dict_input, dict_eval, global_t): if self.thread_index >= 0: assert set(dict_input.keys()) == set(dict_eval.keys()), print( 
dict_input.keys(), dict_eval.keys()) feed_dict = {} for key in dict_input.keys(): feed_dict.update({dict_input[key]: dict_eval[key]}) summary_str = sess.run(summary_op, feed_dict=feed_dict) for sum_wr in summary_writer: sum_wr.add_summary(summary_str, global_t) def set_start_time(self, start_time): self.start_time = start_time def _fill_experience(self, sess): """ Fill experience buffer until buffer is full. """ #print("Start experience filling", flush=True) prev_state = self.environment.last_state last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward, prev_state) #print("Local network run base policy, value!", flush=True) pi_, _, _ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) new_state, reward, terminal, pixel_change = self.environment.process( action, flag=0) frame = ExperienceFrame( { key: val for key, val in prev_state.items() if 'objectType' not in key }, reward, action, terminal, pixel_change, last_action, last_reward) self.experience.add_frame(frame) if terminal: self.environment.reset() if self.experience.is_full(): self.environment.reset() print("Replay buffer filled") def _print_log(self, global_t): if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) #, flush=True) # print("### Experience : {}".format(self.experience.get_debug_string())) def _process_base(self, sess, global_t, summary_writer, summary_op_dict, summary_dict): #, losses_input): # [Base A3C] states = [] last_action_rewards = [] actions = [] rewards = [] values = [] terminal_end = False start_lstm_state = None if self.use_lstm: start_lstm_state = self.local_network.base_lstm_state_out mode = "segnet" if self.segnet_mode >= 2 else "" # t_max times loop flag = 0 for _ in range(self.n_step_TD): # Prepare last action reward last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward, self.environment.last_state) pi_, value_, losses = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward, mode) action = self.choose_action(pi_) states.append(self.environment.last_state) last_action_rewards.append(last_action_reward) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("Trainer {}>>> Local step {}:".format( self.thread_index, self.local_t)) print("Trainer {}>>> pi={}".format(self.thread_index, pi_)) print("Trainer {}>>> V={}".format(self.thread_index, value_)) flag = 1 prev_state = self.environment.last_state # Process game new_state, reward, terminal, pixel_change = self.environment.process( action, flag=flag) frame = ExperienceFrame( { key: val for key, val in prev_state.items() if 'objectType' not in key }, reward, action, terminal, pixel_change, last_action, last_reward) # Store to experience self.experience.add_frame(frame) # Use to know about Experience collection #print(self.experience.get_debug_string()) 
self.episode_reward += reward rewards.append(reward) self.local_t += 1 if terminal: terminal_end = True print("Trainer {}>>> score={}".format( self.thread_index, self.episode_reward)) #, flush=True) summary_dict['values'].update( {'score_input': self.episode_reward}) success = 1 if self.environment._last_full_state[ "success"] else 0 #print("Type:", type(self.environment._last_full_state["success"]), len(self.success_rates), success) self.success_rates.append(success) summary_dict['values'].update({ 'sr_input': np.mean(self.success_rates) if len(self.success_rates) == self.sr_size else 0 }) self.episode_reward = 0 self.environment.reset() self.local_network.reset_state() if flag: flag = 0 break R = 0.0 if not terminal_end: R = self.local_network.run_base_value( sess, new_state, frame.get_action_reward(self.action_size)) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_adv = [] batch_R = [] batch_sobjT = [] for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + self.gamma * R adv = R - Vi a = np.zeros([self.action_size]) a[ai] = 1.0 batch_si.append(si['image']) batch_a.append(a) batch_adv.append(adv) batch_R.append(R) if self.segnet_param_dict["segnet_mode"] >= 2: batch_sobjT.append(si['objectType']) batch_si.reverse() batch_a.reverse() batch_adv.reverse() batch_R.reverse() batch_sobjT.reverse() #print(np.unique(batch_sobjT)) ## HERE Mathematical Error A3C: only last values should be used for base/ or aggregate with last made return batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state def _process_pc(self, sess): # [pixel change] # Sample 20+1 frame (+1 for last next state) #print(">>> Process run!", flush=True) pc_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last # pc_experience_frames.reverse() pc_experience_frames = pc_experience_frames[::-1] #print(">>> Process ran!", flush=True) batch_pc_si = [] batch_pc_a = [] batch_pc_R = [] batch_pc_last_action_reward = [] pc_R = np.zeros([20, 20], dtype=np.float32) if not pc_experience_frames[1].terminal: pc_R = self.local_network.run_pc_q_max( sess, pc_experience_frames[0].state, pc_experience_frames[0].get_last_action_reward( self.action_size)) #print(">>> Process run!", flush=True) for frame in pc_experience_frames[1:]: pc_R = frame.pixel_change + self.gamma_pc * pc_R a = np.zeros([self.action_size]) a[frame.action] = 1.0 last_action_reward = frame.get_last_action_reward(self.action_size) batch_pc_si.append(frame.state['image']) batch_pc_a.append(a) batch_pc_R.append(pc_R) batch_pc_last_action_reward.append(last_action_reward) batch_pc_si.reverse() batch_pc_a.reverse() batch_pc_R.reverse() batch_pc_last_action_reward.reverse() #print(">>> Process ended!", flush=True) return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R def _process_vr(self, sess): # [Value replay] # Sample 20+1 frame (+1 for last next state) vr_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last vr_experience_frames.reverse() batch_vr_si = [] batch_vr_R = [] batch_vr_last_action_reward = [] vr_R = 0.0 if not vr_experience_frames[1].terminal: vr_R = self.local_network.run_vr_value( sess, vr_experience_frames[0].state, vr_experience_frames[0].get_last_action_reward( self.action_size)) # t_max times loop for frame in vr_experience_frames[1:]: vr_R = frame.reward + self.gamma * vr_R 
batch_vr_si.append(frame.state['image']) batch_vr_R.append(vr_R) last_action_reward = frame.get_last_action_reward(self.action_size) batch_vr_last_action_reward.append(last_action_reward) batch_vr_si.reverse() batch_vr_R.reverse() batch_vr_last_action_reward.reverse() return batch_vr_si, batch_vr_last_action_reward, batch_vr_R def _process_rp(self): # [Reward prediction] rp_experience_frames = self.experience.sample_rp_sequence() # 4 frames batch_rp_si = [] batch_rp_c = [] for i in range(3): batch_rp_si.append(rp_experience_frames[i].state['image']) # one hot vector for target reward r = rp_experience_frames[3].reward rp_c = [0.0, 0.0, 0.0] if -1e-10 < r < 1e-10: rp_c[0] = 1.0 # zero elif r > 0: rp_c[1] = 1.0 # positive else: rp_c[2] = 1.0 # negative batch_rp_c.append(rp_c) return batch_rp_si, batch_rp_c def process(self, sess, global_t, summary_writer, summary_op_dict, score_input, sr_input, eval_input, entropy_input, term_global_t, losses_input): if self.prev_local_t == -1 and self.segnet_mode >= 2: self.prev_local_t = 0 sess.run(self.local_network.reset_evaluation_vars) # Fill experience replay buffer #print("Inside train process of thread!", flush=True) if not self.experience.is_full(): self._fill_experience(sess) return 0, None start_local_t = self.local_t episode_score = None cur_learning_rate = self._anneal_learning_rate(global_t) #print("Weights copying!", flush=True) # Copy weights from shared to local sess.run(self.sync) #print("Weights copied successfully!", flush=True) summary_dict = {'placeholders': {}, 'values': {}} summary_dict['placeholders'].update(losses_input) # [Base] #print("[Base]", flush=True) batch_si, batch_sobjT, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state, = \ self._process_base(sess, global_t, summary_writer, summary_op_dict, summary_dict) if summary_dict['values'].get('score_input', None) is not None: self._record_one(sess, summary_writer, summary_op_dict['score_input'], score_input, summary_dict['values']['score_input'], global_t) self._record_one(sess, summary_writer, summary_op_dict['sr_input'], sr_input, summary_dict['values']['sr_input'], global_t) #self._record_one(sess, summary_writer, summary_op_dict['term_global_t'], term_global_t, # global_t, global_t) #summary_writer[0].flush() # summary_writer[1].flush() # Return advanced local step size episode_score = summary_dict['values'].get('score_input', None) summary_dict['values'] = {} feed_dict = { self.local_network.base_input: batch_si, self.local_network.base_last_action_reward_input: batch_last_action_rewards, self.local_network.base_a: batch_a, self.local_network.base_adv: batch_adv, self.local_network.base_r: batch_R, # [common] self.learning_rate_input: cur_learning_rate, self.is_training: True } if self.use_lstm: feed_dict[ self.local_network.base_initial_lstm_state] = start_lstm_state if self.segnet_param_dict["segnet_mode"] >= 2: feed_dict[self.local_network.base_segm_mask] = batch_sobjT #print("[Pixel change]", flush=True) # [Pixel change] if self.use_pixel_change: batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc( sess) pc_feed_dict = { self.local_network.pc_input: batch_pc_si, self.local_network.pc_last_action_reward_input: batch_pc_last_action_reward, self.local_network.pc_a: batch_pc_a, self.local_network.pc_r: batch_pc_R } feed_dict.update(pc_feed_dict) #print("[Value replay]", flush=True) # [Value replay] if self.use_value_replay: batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr( sess) vr_feed_dict = { 
self.local_network.vr_input: batch_vr_si, self.local_network.vr_last_action_reward_input: batch_vr_last_action_reward, self.local_network.vr_r: batch_vr_R } feed_dict.update(vr_feed_dict) # [Reward prediction] #print("[Reward prediction]", flush=True) if self.use_reward_prediction: batch_rp_si, batch_rp_c = self._process_rp() rp_feed_dict = { self.local_network.rp_input: batch_rp_si, self.local_network.rp_c_target: batch_rp_c } feed_dict.update(rp_feed_dict) #print(len(batch_rp_c), batch_rp_c) grad_check = None #if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: # grad_check = [tf.add_check_numerics_ops()] #print("Applying gradients in train!", flush=True) # Calculate gradients and copy them to global network. out_list = [self.apply_gradients] out_list += [ self.local_network.total_loss, self.local_network.base_loss, self.local_network.policy_loss, self.local_network.value_loss, self.local_network.entropy ] if self.segnet_mode >= 2: out_list += [self.local_network.decoder_loss] out_list += [self.local_network.regul_loss] if self.use_pixel_change: out_list += [self.local_network.pc_loss] if self.use_value_replay: out_list += [self.local_network.vr_loss] if self.use_reward_prediction: out_list += [self.local_network.rp_loss] if self.segnet_mode >= 2: out_list += [self.local_network.update_evaluation_vars] if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: out_list += [self.local_network.evaluation] import time now = time.time() with tf.control_dependencies(grad_check): if GPU_LOG: return_list = sess.run(out_list, feed_dict=feed_dict, options=run_options, run_metadata=self.run_metadata) else: return_list = sess.run(out_list, feed_dict=feed_dict, options=run_options) if time.time() - now > 30.0: print( "Too much time on sess.run: check tensorflow") #, flush=True) sys.exit(0) raise ValueError("More than 100 seconds update in tensorflow!") # gradients_tuple, total_loss, base_loss, policy_loss, value_loss, entropy = return_list[: 6] grad_norm = gradients_tuple[1] return_list = return_list[6:] return_string = "Trainer {}>>> Total loss: {}, Base loss: {}\n".format( self.thread_index, total_loss, base_loss) return_string += "\t\tPolicy loss: {}, Value loss: {}, Grad norm: {}\nEntropy: {}\n".format( policy_loss, value_loss, grad_norm, entropy) losses_eval = { 'all/total_loss': total_loss, 'all/base_loss': base_loss, 'all/policy_loss': policy_loss, 'all/value_loss': value_loss, 'all/loss/grad_norm': grad_norm } if self.segnet_mode >= 2: decoder_loss, l2_loss = return_list[:2] return_list = return_list[2:] return_string += "\t\tDecoder loss: {}, L2 weights loss: {}\n".format( decoder_loss, l2_loss) losses_eval.update({ 'all/decoder_loss': decoder_loss, 'all/l2_weights_loss': l2_loss }) if self.use_pixel_change: pc_loss = return_list[0] return_list = return_list[1:] return_string += "\t\tPC loss: {}\n".format(pc_loss) losses_eval.update({'all/pc_loss': pc_loss}) if self.use_value_replay: vr_loss = return_list[0] return_list = return_list[1:] return_string += "\t\tVR loss: {}\n".format(vr_loss) losses_eval.update({'all/vr_loss': vr_loss}) if self.use_reward_prediction: rp_loss = return_list[0] return_list = return_list[1:] return_string += "\t\tRP loss: {}\n".format(rp_loss) losses_eval.update({'all/rp_loss': rp_loss}) if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: if self.segnet_mode >= 2: return_string += "\t\tmIoU: {}\n".format(return_list[-1]) summary_dict['values'].update(losses_eval) # Printing losses if self.local_t - 
self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: if self.segnet_mode >= 2: self._record_one(sess, summary_writer, summary_op_dict['eval_input'], eval_input, return_list[-1], global_t) self._record_one(sess, summary_writer, summary_op_dict['entropy'], entropy_input, entropy, global_t) # summary_writer[0].flush() # summary_writer[1].flush() print(return_string) self.prev_local_t_loss += LOSS_AND_EVAL_LOG_INTERVAL if GPU_LOG: fetched_timeline = timeline.Timeline(self.run_metadata.step_stats) chrome_trace = fetched_timeline.generate_chrome_trace_format() self.many_runs_timeline.update_timeline(chrome_trace) self._print_log(global_t) #Recording score and losses self._record_all(sess, summary_writer, summary_op_dict['losses_input'], summary_dict['placeholders'], summary_dict['values'], global_t) # Return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, episode_score
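# The _process_base rollout in the Trainer above collects states, actions,
# rewards and value estimates for up to n_step_TD steps and then folds them
# into discounted returns and advantages in reverse order. A compact
# standalone sketch of that bootstrap computation (the function name and the
# plain-list interface are illustrative, not part of the Trainer class):
def n_step_returns_and_advantages(rewards, values, bootstrap_value, gamma):
    # rewards/values are ordered oldest -> newest for one rollout segment;
    # bootstrap_value is V(s_T) from the network, or 0.0 if the episode ended.
    R = bootstrap_value
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R             # discounted n-step return
        returns.append(R)
        advantages.append(R - v)      # advantage fed to the policy-gradient loss
    returns.reverse()
    advantages.reverse()
    return returns, advantages

# Example with gamma=0.99, rewards=[0, 0, 1], values=[0.2, 0.3, 0.5], bootstrap 0.0:
# returns ~ [0.980, 0.990, 1.000], advantages ~ [0.780, 0.690, 0.500]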
class Tester(object):
    def __init__(self):
        self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
        self.action_size = Environment.get_action_size()
        self.global_network = UnrealModel(self.action_size, -1, "/cpu:0", for_display=True)
        self.env = Environment.create_environment()
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.ep_reward = 0
        self.mazemap = MazeMap()

    def process(self, sess):
        self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)

        last_action = self.env.last_action
        last_reward = np.clip(self.env.last_reward, -1, 1)
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        if not USE_PIXEL_CHANGE:
            pi_values, v_value = self.global_network.run_base_policy_and_value(
                sess, self.env.last_state, last_action_reward)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.env.last_state, last_action_reward)
        self.value_history.add_value(v_value)

        action = self.choose_action(pi_values)
        state, reward, terminal, pc, vtrans, vrot = self.env.process(action)
        self.state_history.add_state(state)

        self.ep_reward += reward
        self.mazemap.update(vtrans, vrot)

        if reward > 9:
            # When the agent reaches the maze goal the reward is 10,
            # so the map has to be reset.
            self.mazemap.reset()

        if terminal:
            # By default the lab environment ends an episode after 3600 frames
            # rather than signalling terminal when the maze goal is reached.
            self.env.reset()
            self.ep_reward = 0
            self.mazemap.reset()

        self.show_ob(state, 3, 3, "Observation")
        self.show_pc(pc, 100, 3, 3.0, "Pixel Change")
        self.show_pc(pc_q[:, :, action], 200, 3, 0.4, "PC Q")
        self.show_map(300, 3, "Maze Map")
        self.show_pi(pi_values)
        self.show_reward()
        self.show_rp()
        self.show_value()

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def scale_image(self, image, scale):
        return image.repeat(scale, axis=0).repeat(scale, axis=1)

    def draw_text(self, text, left, bottom, color=WHITE):
        font = cv2.FONT_HERSHEY_COMPLEX
        cv2.putText(self.img, text, (left, bottom), font, 0.35, color)

    def show_pc(self, pc, left, top, rate, label):
        pc = np.clip(pc * 255.0 * rate, 0.0, 255.0)
        data = pc.astype(np.uint8)
        data = np.stack([data for _ in range(3)], axis=2)
        data = self.scale_image(data, 4)
        h = data.shape[0]
        w = data.shape[1]
        self.img[top:top + h, left:left + w, :] = data
        self.draw_text(label, (left + 2), (top + h + 15))

    def show_map(self, left, top, label):
        maze = self.mazemap.get_map(84, 84)
        maze = (maze * 255).astype(np.uint8)
        h = maze.shape[0]
        w = maze.shape[1]
        self.img[top:top + h, left:left + w, :] = maze
        self.draw_text(label, (left + 2), (top + h + 5))

    def show_pi(self, pi):
        for i in range(len(pi)):
            width = int(pi[i] * 100)
            cv2.rectangle(self.img, (3, 113 + 15 * i), (width, 120 + 15 * i), WHITE)
        self.draw_text("Policy", 20, 120 + 15 * len(pi))

    def show_ob(self, state, left, top, label):
        state = (state * 255.0).astype(np.uint8)
        h = state.shape[0]
        w = state.shape[1]
        self.img[top:top + h, left:left + w, :] = state
        self.draw_text(label, (left + 2), (top + h + 15))

    def show_value(self, left, top, height, width):
        if self.value_history.is_empty:
            return

        min_v = float("inf")
        max_v = float("-inf")
        values = self.value_history.values
        for v in values:
            min_v = min(min_v, v)
            max_v = max(max_v, v)

        bottom = top + height
        right = left + width

        d = max_v - min_v
        last_r = 0.0
        for i, v in enumerate(values):
            r = (v - min_v) / d
            if i > 0:
                x0 = i - 1 + left
                x1 = i + left
                y0 = bottom - last_r * height
                y1 = bottom - r * height
                cv2.line(self.img, (y0, x0), (y1, x1), BLUE, 2)
            last_r = r

        cv2.line(self.img, (top, left), (bottom, left), WHITE, 1)
        cv2.line(self.img, (top, right), (bottom, right), WHITE, 1)
        cv2.line(self.img, (top, left), (top, right), WHITE, 1)
        cv2.line(self.img, (bottom, left), (bottom, right), WHITE, 1)
        self.draw_text("Q Value", 120, 215)

    def show_rp(self):
        pass

    def show_reward(self):
        self.draw_text("Reward: {}".format(int(self.ep_reward)), 10, 230)

    def get_frame(self):
        return self.img
device = "/cpu:0" if USE_GPU: device = "/gpu:0" initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) global_t = 0 stop_requested = False terminate_reqested = False action_size = Environment.get_action_size() global_network = UnrealModel(action_size, -1, device) trainers = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = RMSP_ALPHA, momentum = 0.0, epsilon = RMSP_EPSILON, clip_norm = GRAD_NORM_CLIP, device = device) for i in range(PARALLEL_SIZE): trainer = Trainer(i, global_network,
device = "/cpu:0" if USE_GPU: device = "/gpu:0" initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) global_t = 0 stop_requested = False terminate_reqested = False action_size = Environment.get_action_size() global_network = UnrealModel(action_size, -1, device) trainers = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) for i in range(PARALLEL_SIZE): trainer = Trainer(i, global_network,
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, vf_coeff, local_t_max,
             gamma, gamma_pc, experience_history_size,
             max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.local_network = UnrealModel(self.action_size, thread_index,
                                     use_pixel_change, use_value_replay,
                                     use_reward_prediction, pixel_change_lambda,
                                     entropy_beta, vf_coeff, device)
    # self.local_network.prepare_loss()

    # adding things for acktr
    self.local_network.prepare_loss_acktr()
    self.optim = optim = KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                       momentum=0.9, kfac_update=1, epsilon=0.01,
                                       stats_decay=0.99, async=1, cold_iter=10,
                                       max_grad_norm=max_grad_norm)
    update_stats_op = optim.compute_and_apply_stats(self.local_network.fisher_loss,
                                                    var_list=self.local_network.params)
    train_op, q_runner = optim.apply_gradients(
        list(zip(self.local_network.grads, self.local_network.params)))

    # update the rest according to normal stuff
    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.intermediate_loss,
        global_network.get_vars(),
        self.local_network.get_vars())

    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
def run(self): device = "/cpu:0" if USE_GPU: device = "/gpu:0" self.print_flags_info() if flags.segnet == -1: with open(flags.segnet_config) as f: self.config = json.load(f) self.num_classes = self.config["NUM_CLASSES"] self.use_vgg = self.config["USE_VGG"] if self.use_vgg is False: self.vgg_param_dict = None print("No VGG path in config, so learning from scratch") else: self.vgg16_npy_path = self.config["VGG_FILE"] self.vgg_param_dict = np.load(self.vgg16_npy_path, encoding='latin1').item() print("VGG parameter loaded") self.bayes = self.config["BAYES"] segnet_param_dict = {'segnet_mode': flags.segnet, 'vgg_param_dict': self.vgg_param_dict, 'use_vgg': self.use_vgg, 'num_classes': self.num_classes, 'bayes': self.bayes} else: # 0, 1, 2, 3 segnet_param_dict = {'segnet_mode': flags.segnet} if flags.env_type != 'indoor': env_config = {} else: env_config = sim_config.get(flags.env_name) self.image_shape = [env_config.get('height', 84), env_config.get('width', 84)] self.map_file = env_config.get('objecttypes_file', '../../objectTypes_1x.csv') initial_learning_rate = log_uniform(flags.initial_alpha_low, flags.initial_alpha_high, flags.initial_alpha_log_rate) self.global_t = 0 self.stop_requested = False self.terminate_requested = False action_size = Environment.get_action_size(flags.env_type, flags.env_name) objective_size = Environment.get_objective_size(flags.env_type, flags.env_name) is_training = tf.placeholder(tf.bool, name="training") self.random_state = np.random.RandomState(seed=env_config.get("seed", 0xA3C)) print("Global network initializing!")#, flush=True) self.global_network = UnrealModel(action_size, objective_size, -1, flags.use_lstm, flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.pixel_change_lambda, flags.entropy_beta, device, segnet_param_dict=segnet_param_dict, image_shape=self.image_shape, is_training=is_training, n_classes=flags.n_classes, segnet_lambda=flags.segnet_lambda, dropout=flags.dropout) self.trainers = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = flags.rmsp_alpha, momentum = 0.0, epsilon = flags.rmsp_epsilon, clip_norm = flags.grad_norm_clip, device = device) for i in range(flags.parallel_size): trainer = Trainer(i, self.global_network, initial_learning_rate, learning_rate_input, grad_applier, flags.env_type, flags.env_name, flags.use_lstm, flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.pixel_change_lambda, flags.entropy_beta, flags.local_t_max, flags.n_step_TD, flags.gamma, flags.gamma_pc, flags.experience_history_size, flags.max_time_step, device, segnet_param_dict=segnet_param_dict, image_shape=self.image_shape, is_training=is_training, n_classes = flags.n_classes, random_state=self.random_state, termination_time=flags.termination_time_sec, segnet_lambda=flags.segnet_lambda, dropout=flags.dropout) self.trainers.append(trainer) self.last_scores = [] self.best_score = -1.0 # prepare session config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False) config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) # Wrap sess.run for debugging messages! 
def run_(*args, **kwargs): #print(">>> RUN!", args[0] if args else None)#, flush=True) return self.sess.__run(*args, **kwargs) # getattr(self, "__run")(self, *args, **kwargs) self.sess.__run, self.sess.run = self.sess.run, run_ self.sess.run(tf.global_variables_initializer()) # summary for tensorboard self.score_input = tf.placeholder(tf.float32) self.sr_input = tf.placeholder(tf.float32) self.mIoU_input = tf.placeholder(tf.float32) self.term_global_t = tf.placeholder(tf.int32) self.losses_input = {} self.total_loss = tf.placeholder(tf.float32, name='total_loss') self.losses_input.update({'all/total_loss': self.total_loss}) self.base_loss = tf.placeholder(tf.float32, name='base_loss') self.losses_input.update({'all/base_loss': self.base_loss}) self.policy_loss = tf.placeholder(tf.float32, name='policy_loss') self.losses_input.update({'all/policy_loss': self.policy_loss}) self.value_loss = tf.placeholder(tf.float32, name='policy_loss') self.losses_input.update({'all/value_loss': self.value_loss}) self.grad_norm = tf.placeholder(tf.float32, name='grad_norm') self.losses_input.update({'all/loss/grad_norm': self.grad_norm}) self.entropy_input = tf.placeholder(tf.float32, shape=[None], name='entropy') if segnet_param_dict["segnet_mode"] >= 2: self.decoder_loss = tf.placeholder(tf.float32, name='decoder_loss') self.losses_input.update({'all/decoder_loss': self.decoder_loss}) self.l2_weights_loss = tf.placeholder(tf.float32, name='regul_weights_loss') self.losses_input.update({'all/l2_weights_loss': self.l2_weights_loss}) if flags.use_pixel_change: self.pc_loss = tf.placeholder(tf.float32, name='pc_loss') self.losses_input.update({'all/pc_loss': self.pc_loss}) if flags.use_value_replay: self.vr_loss = tf.placeholder(tf.float32, name='vr_loss') self.losses_input.update({'all/vr_loss': self.vr_loss}) if flags.use_reward_prediction: self.rp_loss = tf.placeholder(tf.float32, name='rp_loss') self.losses_input.update({'all/rp_loss': self.rp_loss}) score_summary = tf.summary.scalar("all/eval/score", self.score_input) sr_summary = tf.summary.scalar("all/eval/success_rate", self.sr_input) term_summary = tf.summary.scalar("all/eval/term_global_t", self.term_global_t) eval_summary = tf.summary.scalar("all/eval/mIoU_all", self.mIoU_input) losses_summary_list = [] for key, val in self.losses_input.items(): losses_summary_list += [tf.summary.scalar(key, val)] self.summary_op_dict = {'score_input': score_summary, 'eval_input': eval_summary, 'sr_input':sr_summary, 'losses_input': tf.summary.merge(losses_summary_list), 'entropy': tf.summary.scalar('all/eval/entropy_stepTD', tf.reduce_mean(self.entropy_input)), 'term_global_t': term_summary} flags.checkpoint_dir = os.path.join(base_dir, flags.checkpoint_dir) #print("First dirs {}::{}".format(flags.log_dir, flags.checkpoint_dir)) flags.checkpoint_dir = flags.checkpoint_dir print("Checkpoint dir: {}, Log dir: {}".format(flags.checkpoint_dir, flags.log_dir)) overall_FW = tf.summary.FileWriter(os.path.join(flags.log_dir, 'overall'), self.sess.graph) self.summary_writer = [(tf.summary.FileWriter(os.path.join(flags.log_dir, 'worker_{}'.format(i)), self.sess.graph), overall_FW) for i in range(flags.parallel_size)] # init or load checkpoint with saver self.saver = tf.train.Saver(self.global_network.get_global_vars(), max_to_keep=20) #checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir, latest_filename ="best-checkpoint") #if checkpoint is None or checkpoint.model_checkpoint_path is None: # checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir) 
checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir) if checkpoint and checkpoint.model_checkpoint_path: if flags.segnet == -1: from tensorflow.python import pywrap_tensorflow reader = pywrap_tensorflow.NewCheckpointReader(checkpoint.model_checkpoint_path) big_var_to_shape_map = reader.get_variable_to_shape_map() s = [] for key in big_var_to_shape_map: s += [key] # print("tensor_name: ", key) glob_var_names = [v.name for v in tf.global_variables()] endings = [r.split('/')[-1][:-2] for r in glob_var_names] old_ckpt_to_new_ckpt = {[k for k in s if endings[i] in k][0]: v for i, v in enumerate(tf.global_variables())} saver1 = tf.train.Saver(var_list=old_ckpt_to_new_ckpt) saver1.restore(self.sess, checkpoint.model_checkpoint_path) else: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) tokens = checkpoint.model_checkpoint_path.split("-") # set global step if 'best' in checkpoint.model_checkpoint_path: files = os.listdir(flags.checkpoint_dir) max_g_step = 0 max_best_score = -10 for file in files: if '.meta' not in file or 'checkpoint' not in file: continue if len(tokens) == 2: continue if len(tokens) > 3: best_score = float('-0.'+file.split('-')[2]) if 'best' in file else float('-0.'+file.split('-')[1]) if best_score > max_best_score: g_step = int(file.split('-')[3]).split('.')[0] if 'best' in file else int(file.split('-')[2].split('.')[0]) if max_g_step < g_step: max_g_step = g_step else: self.best_score = -1.0 g_step = int(file.split('-')[2]) if 'best' in file else int(file.split('-')[1]) if max_g_step < g_step: max_g_step = g_step self.best_score = max_best_score self.global_t = max_g_step print("Chosen g_step >>", g_step) else: if len(tokens) == 3: self.global_t = int(tokens[2]) else: self.global_t = int(tokens[1]) #for i in range(flags.parallel_size): # self.trainers[i].local_t = self.global_t print(">>> global step set: ", self.global_t) # set wall time wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' 
+ str(self.global_t) with open(wall_t_fname, 'r') as f: self.wall_t = float(f.read()) self.next_save_steps = (self.global_t + flags.save_interval_step) // flags.save_interval_step * flags.save_interval_step print_tensors_in_checkpoint_file(file_name=checkpoint.model_checkpoint_path, tensor_name='', all_tensors=False, all_tensor_names=True) else: print("Could not find old checkpoint") # set wall time self.wall_t = 0.0 self.next_save_steps = flags.save_interval_step print("Global step {}, max best score {}".format(self.global_t, self.best_score)) if flags.segnet_pretrain: checkpoint_dir = "../erfnet_segmentation/models" checkpoint_dir = os.path.join(checkpoint_dir, "aug_erfnetC_0_{}x{}_{}x/snapshots_best".format( self.image_shape[1], self.image_shape[0], self.map_file.split('_')[1].split('x')[0])) checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) big_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_encoder') big_weights += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_decoder') erfnet_weights = [l.name.split(':')[0].rsplit('net_-1/base_encoder/')[-1] for l in big_weights if len(l.name.split(':')[0].rsplit('net_-1/base_encoder/')) == 2] erfnet_weights += [l.name.split(':')[0].rsplit('net_-1/base_decoder/')[-1] for l in big_weights if len(l.name.split(':')[0].rsplit('net_-1/base_decoder/')) == 2] if checkpoint and checkpoint.model_checkpoint_path: saver2 = tf.train.Saver(var_list=dict(zip(erfnet_weights, big_weights))) saver2.restore(self.sess, checkpoint.model_checkpoint_path) print("ErfNet pretrained weights restored from file ", checkpoint_dir) else: print("Can't load pretrained weights for ErfNet from file ", checkpoint_dir) # run training threads self.train_threads = [] for i in range(flags.parallel_size): self.train_threads.append(threading.Thread(target=self.train_function, args=(i,True))) signal.signal(signal.SIGINT, self.signal_handler) # set start time self.start_time = time.time() - self.wall_t print("Ready to start") for t in self.train_threads: t.start() print('Press Ctrl+C to stop')#, flush=True) signal.pause()
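# Illustrative sketch (not from the sources above): the resume path restores
# wall-clock time from a side file named "wall_t.<global_t>" and rounds the
# next checkpoint step up to the save grid. The concrete numbers below are
# made up; save_interval_step and global_t stand in for the flags/state.
import time

save_interval_step = 100000
global_t = 1234567            # step recovered from the checkpoint
wall_t = 3600.0               # seconds read back from the wall_t.<step> file

# Round up to the next multiple of save_interval_step (1300000 here), so a
# resumed run keeps saving on the same step grid as the original run.
next_save_steps = (global_t + save_interval_step) // save_interval_step * save_interval_step

# Shift start_time into the past so elapsed time keeps accumulating across
# restarts: time.time() - start_time equals wall_t right after resuming.
start_time = time.time() - wall_t
print(next_save_steps)        # 1300000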
class Application(object): def __init__(self): pass def train_function(self, parallel_index, preparing): """ Train each environment. """ trainer = self.trainers[parallel_index] try: if preparing: trainer.prepare() except Exception as e: print(str(e))#, flush=True) raise Exception("Problem with trainer environment creation") # set start_time trainer.set_start_time(self.start_time) print("Trainer ", parallel_index, " process (re)start!") prev_print_t = 0 while True: if self.global_t - prev_print_t >= 1000 or not prev_print_t and self.global_t != prev_print_t: prev_print_t = self.global_t print("Trainer {0}>>> stop_requested: {1}, terminate_requested: {2}, global_t: {3}".format(parallel_index, self.stop_requested, self.terminate_requested, self.global_t)) #if parallel_index == 0: # print("Graph size is {}".format( # len([n.name for n in self.sess.graph.as_graph_def().node]))) if self.stop_requested: print("Trainer ", parallel_index, ": stop requested!") break if self.terminate_requested: print("Trainer ", parallel_index, ": terminate_requested => process stop!") trainer.stop() break if self.global_t > flags.max_time_step: print("Trainer ", parallel_index, ": end of training!") trainer.stop() break if parallel_index == 0 and self.global_t > self.next_save_steps: # Save checkpoint self.save() try: diff_global_t, score = trainer.process(self.sess, self.global_t, self.summary_writer[parallel_index], self.summary_op_dict, self.score_input, self.sr_input, self.mIoU_input, self.entropy_input, self.term_global_t, self.losses_input) self.global_t += diff_global_t # if parallel_index == 0 and score is not None: # #print("Got score", flush=True) # self.last_scores += [score] # if len(self.last_scores) >= 50: # print("Last scores len >= 50") # cur_score = np.mean(self.last_scores) # print("Best score: {}, Cur score: {}".format(self.best_score, cur_score)) # self.last_scores = self.last_scores[-10:] # if cur_score > self.best_score: # self.best_score = cur_score # self.save(name="best-checkpoint") # [n.name for n in tf.get_default_graph().as_graph_def().node] # [op for op in tf.get_default_graph().get_operations()] #op.name # GPU logging memory # prev_runs_t = 0 # if self.global_t - prev_runs_t > 1000 or prev_runs_t == 0: # prev_runs_t = self.global_t # trainer.many_runs_timeline.save(os.path.join(flags.checkpoint_dir, # 'timeline_{}_merged_{}_runs.json'.format( # parallel_index, self.global_t))) # else: # trainer.many_runs_timeline.save(os.path.join(flags.checkpoint_dir, # 'timeline_{}_merged_{}_runs.json'.format( # parallel_index, prev_runs_t))) # except Exception as e: print(traceback.format_exc())#, flush=True) trainer.stop() ## Let it be here!!! 
print("Trainer ", parallel_index, " process Error!")#, flush=True) break print("Trainer ", parallel_index, " after a while return!") def run(self): device = "/cpu:0" if USE_GPU: device = "/gpu:0" self.print_flags_info() if flags.segnet == -1: with open(flags.segnet_config) as f: self.config = json.load(f) self.num_classes = self.config["NUM_CLASSES"] self.use_vgg = self.config["USE_VGG"] if self.use_vgg is False: self.vgg_param_dict = None print("No VGG path in config, so learning from scratch") else: self.vgg16_npy_path = self.config["VGG_FILE"] self.vgg_param_dict = np.load(self.vgg16_npy_path, encoding='latin1').item() print("VGG parameter loaded") self.bayes = self.config["BAYES"] segnet_param_dict = {'segnet_mode': flags.segnet, 'vgg_param_dict': self.vgg_param_dict, 'use_vgg': self.use_vgg, 'num_classes': self.num_classes, 'bayes': self.bayes} else: # 0, 1, 2, 3 segnet_param_dict = {'segnet_mode': flags.segnet} if flags.env_type != 'indoor': env_config = {} else: env_config = sim_config.get(flags.env_name) self.image_shape = [env_config.get('height', 84), env_config.get('width', 84)] self.map_file = env_config.get('objecttypes_file', '../../objectTypes_1x.csv') initial_learning_rate = log_uniform(flags.initial_alpha_low, flags.initial_alpha_high, flags.initial_alpha_log_rate) self.global_t = 0 self.stop_requested = False self.terminate_requested = False action_size = Environment.get_action_size(flags.env_type, flags.env_name) objective_size = Environment.get_objective_size(flags.env_type, flags.env_name) is_training = tf.placeholder(tf.bool, name="training") self.random_state = np.random.RandomState(seed=env_config.get("seed", 0xA3C)) print("Global network initializing!")#, flush=True) self.global_network = UnrealModel(action_size, objective_size, -1, flags.use_lstm, flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.pixel_change_lambda, flags.entropy_beta, device, segnet_param_dict=segnet_param_dict, image_shape=self.image_shape, is_training=is_training, n_classes=flags.n_classes, segnet_lambda=flags.segnet_lambda, dropout=flags.dropout) self.trainers = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = flags.rmsp_alpha, momentum = 0.0, epsilon = flags.rmsp_epsilon, clip_norm = flags.grad_norm_clip, device = device) for i in range(flags.parallel_size): trainer = Trainer(i, self.global_network, initial_learning_rate, learning_rate_input, grad_applier, flags.env_type, flags.env_name, flags.use_lstm, flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.pixel_change_lambda, flags.entropy_beta, flags.local_t_max, flags.n_step_TD, flags.gamma, flags.gamma_pc, flags.experience_history_size, flags.max_time_step, device, segnet_param_dict=segnet_param_dict, image_shape=self.image_shape, is_training=is_training, n_classes = flags.n_classes, random_state=self.random_state, termination_time=flags.termination_time_sec, segnet_lambda=flags.segnet_lambda, dropout=flags.dropout) self.trainers.append(trainer) self.last_scores = [] self.best_score = -1.0 # prepare session config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False) config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) # Wrap sess.run for debugging messages! 
def run_(*args, **kwargs): #print(">>> RUN!", args[0] if args else None)#, flush=True) return self.sess.__run(*args, **kwargs) # getattr(self, "__run")(self, *args, **kwargs) self.sess.__run, self.sess.run = self.sess.run, run_ self.sess.run(tf.global_variables_initializer()) # summary for tensorboard self.score_input = tf.placeholder(tf.float32) self.sr_input = tf.placeholder(tf.float32) self.mIoU_input = tf.placeholder(tf.float32) self.term_global_t = tf.placeholder(tf.int32) self.losses_input = {} self.total_loss = tf.placeholder(tf.float32, name='total_loss') self.losses_input.update({'all/total_loss': self.total_loss}) self.base_loss = tf.placeholder(tf.float32, name='base_loss') self.losses_input.update({'all/base_loss': self.base_loss}) self.policy_loss = tf.placeholder(tf.float32, name='policy_loss') self.losses_input.update({'all/policy_loss': self.policy_loss}) self.value_loss = tf.placeholder(tf.float32, name='policy_loss') self.losses_input.update({'all/value_loss': self.value_loss}) self.grad_norm = tf.placeholder(tf.float32, name='grad_norm') self.losses_input.update({'all/loss/grad_norm': self.grad_norm}) self.entropy_input = tf.placeholder(tf.float32, shape=[None], name='entropy') if segnet_param_dict["segnet_mode"] >= 2: self.decoder_loss = tf.placeholder(tf.float32, name='decoder_loss') self.losses_input.update({'all/decoder_loss': self.decoder_loss}) self.l2_weights_loss = tf.placeholder(tf.float32, name='regul_weights_loss') self.losses_input.update({'all/l2_weights_loss': self.l2_weights_loss}) if flags.use_pixel_change: self.pc_loss = tf.placeholder(tf.float32, name='pc_loss') self.losses_input.update({'all/pc_loss': self.pc_loss}) if flags.use_value_replay: self.vr_loss = tf.placeholder(tf.float32, name='vr_loss') self.losses_input.update({'all/vr_loss': self.vr_loss}) if flags.use_reward_prediction: self.rp_loss = tf.placeholder(tf.float32, name='rp_loss') self.losses_input.update({'all/rp_loss': self.rp_loss}) score_summary = tf.summary.scalar("all/eval/score", self.score_input) sr_summary = tf.summary.scalar("all/eval/success_rate", self.sr_input) term_summary = tf.summary.scalar("all/eval/term_global_t", self.term_global_t) eval_summary = tf.summary.scalar("all/eval/mIoU_all", self.mIoU_input) losses_summary_list = [] for key, val in self.losses_input.items(): losses_summary_list += [tf.summary.scalar(key, val)] self.summary_op_dict = {'score_input': score_summary, 'eval_input': eval_summary, 'sr_input':sr_summary, 'losses_input': tf.summary.merge(losses_summary_list), 'entropy': tf.summary.scalar('all/eval/entropy_stepTD', tf.reduce_mean(self.entropy_input)), 'term_global_t': term_summary} flags.checkpoint_dir = os.path.join(base_dir, flags.checkpoint_dir) #print("First dirs {}::{}".format(flags.log_dir, flags.checkpoint_dir)) flags.checkpoint_dir = flags.checkpoint_dir print("Checkpoint dir: {}, Log dir: {}".format(flags.checkpoint_dir, flags.log_dir)) overall_FW = tf.summary.FileWriter(os.path.join(flags.log_dir, 'overall'), self.sess.graph) self.summary_writer = [(tf.summary.FileWriter(os.path.join(flags.log_dir, 'worker_{}'.format(i)), self.sess.graph), overall_FW) for i in range(flags.parallel_size)] # init or load checkpoint with saver self.saver = tf.train.Saver(self.global_network.get_global_vars(), max_to_keep=20) #checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir, latest_filename ="best-checkpoint") #if checkpoint is None or checkpoint.model_checkpoint_path is None: # checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir) 
checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir) if checkpoint and checkpoint.model_checkpoint_path: if flags.segnet == -1: from tensorflow.python import pywrap_tensorflow reader = pywrap_tensorflow.NewCheckpointReader(checkpoint.model_checkpoint_path) big_var_to_shape_map = reader.get_variable_to_shape_map() s = [] for key in big_var_to_shape_map: s += [key] # print("tensor_name: ", key) glob_var_names = [v.name for v in tf.global_variables()] endings = [r.split('/')[-1][:-2] for r in glob_var_names] old_ckpt_to_new_ckpt = {[k for k in s if endings[i] in k][0]: v for i, v in enumerate(tf.global_variables())} saver1 = tf.train.Saver(var_list=old_ckpt_to_new_ckpt) saver1.restore(self.sess, checkpoint.model_checkpoint_path) else: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) tokens = checkpoint.model_checkpoint_path.split("-") # set global step if 'best' in checkpoint.model_checkpoint_path: files = os.listdir(flags.checkpoint_dir) max_g_step = 0 max_best_score = -10 for file in files: if '.meta' not in file or 'checkpoint' not in file: continue if len(tokens) == 2: continue if len(tokens) > 3: best_score = float('-0.'+file.split('-')[2]) if 'best' in file else float('-0.'+file.split('-')[1]) if best_score > max_best_score: g_step = int(file.split('-')[3].split('.')[0]) if 'best' in file else int(file.split('-')[2].split('.')[0]) if max_g_step < g_step: max_g_step = g_step else: self.best_score = -1.0 g_step = int(file.split('-')[2]) if 'best' in file else int(file.split('-')[1]) if max_g_step < g_step: max_g_step = g_step self.best_score = max_best_score self.global_t = max_g_step print("Chosen g_step >>", g_step) else: if len(tokens) == 3: self.global_t = int(tokens[2]) else: self.global_t = int(tokens[1]) #for i in range(flags.parallel_size): # self.trainers[i].local_t = self.global_t print(">>> global step set: ", self.global_t) # set wall time wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' 
+ str(self.global_t) with open(wall_t_fname, 'r') as f: self.wall_t = float(f.read()) self.next_save_steps = (self.global_t + flags.save_interval_step) // flags.save_interval_step * flags.save_interval_step print_tensors_in_checkpoint_file(file_name=checkpoint.model_checkpoint_path, tensor_name='', all_tensors=False, all_tensor_names=True) else: print("Could not find old checkpoint") # set wall time self.wall_t = 0.0 self.next_save_steps = flags.save_interval_step print("Global step {}, max best score {}".format(self.global_t, self.best_score)) if flags.segnet_pretrain: checkpoint_dir = "../erfnet_segmentation/models" checkpoint_dir = os.path.join(checkpoint_dir, "aug_erfnetC_0_{}x{}_{}x/snapshots_best".format( self.image_shape[1], self.image_shape[0], self.map_file.split('_')[1].split('x')[0])) checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) big_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_encoder') big_weights += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_decoder') erfnet_weights = [l.name.split(':')[0].rsplit('net_-1/base_encoder/')[-1] for l in big_weights if len(l.name.split(':')[0].rsplit('net_-1/base_encoder/')) == 2] erfnet_weights += [l.name.split(':')[0].rsplit('net_-1/base_decoder/')[-1] for l in big_weights if len(l.name.split(':')[0].rsplit('net_-1/base_decoder/')) == 2] if checkpoint and checkpoint.model_checkpoint_path: saver2 = tf.train.Saver(var_list=dict(zip(erfnet_weights, big_weights))) saver2.restore(self.sess, checkpoint.model_checkpoint_path) print("ErfNet pretrained weights restored from file ", checkpoint_dir) else: print("Can't load pretrained weights for ErfNet from file ", checkpoint_dir) # run training threads self.train_threads = [] for i in range(flags.parallel_size): self.train_threads.append(threading.Thread(target=self.train_function, args=(i,True))) signal.signal(signal.SIGINT, self.signal_handler) # set start time self.start_time = time.time() - self.wall_t print("Ready to start") for t in self.train_threads: t.start() print('Press Ctrl+C to stop')#, flush=True) signal.pause() def save(self, name=""): """ Save checkpoint. Called from thread-0. """ self.stop_requested = True # Wait for all other threads to stop print("Waiting for childs!")#, flush=True) for (i, t) in enumerate(self.train_threads): if i != 0: t.join() # Save try: if not os.path.exists(flags.checkpoint_dir): os.mkdir(flags.checkpoint_dir) # Write wall time wall_t = time.time() - self.start_time wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t) #if not os.path.exists(wall_t_fname): # os.mkdir(wall_t_fname) with open(wall_t_fname, 'w') as f: f.write(str(wall_t)) ckpt_name = 'checkpoint' if name != "": ckpt_name = name ckpt_name += '-' + str(abs(self.best_score))[2:8] # Here may be bug print('Start saving.') self.saver.save(self.sess, flags.checkpoint_dir + '/' + ckpt_name, global_step = self.global_t) print('End saving.') self.stop_requested = False self.next_save_steps += flags.save_interval_step except Exception as e: self.terminate_requested = True ## Let it be here for debug save() function!!! 
print(traceback.format_exc())#, flush=True) raise Exception("Error in 'save' occured!") finally: # Restart other threads print("Restarting other threads!") for i in range(flags.parallel_size): if i != 0: thread = threading.Thread(target=self.train_function, args=(i,False)) self.train_threads[i] = thread thread.start() def signal_handler(self, signal, frame): print('You pressed Ctrl+C!')#, flush=True) self.terminate_requested = True def print_flags_info(self): return_string = "\n\n\n" return_string += "Envs FILE:{}\n".format(flags.env_name) return_string += "Checkpoint dir: {}, Termination time in sec: " \ "{}, Max steps to train: {:2.3E}, Parallel threads:{}\n".format(flags.checkpoint_dir, flags.termination_time_sec, flags.max_time_step, flags.parallel_size) return_string += "Use ErfNet Encoder-Decoder, N classes: {}\n".format(flags.n_classes) if flags.segnet >= 2 else "" return_string += "Use ErfNet Encoder only\n" if flags.segnet == 1 else "" return_string += "Use vanilla encoder\n" if flags.segnet == 0 else "" return_string += "Use PC:{}, Use VR:{}, use RP:{}\n".format(flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction) return_string += "Experience hist size: {}, Local_t max: {}, n-step-TD: {}\n".format(flags.experience_history_size, flags.local_t_max, flags.n_step_TD) return_string += "Entropy beta: {}, Gradient norm clipping: {}, Rmsprop alpha: {}, Saving step: {}\n".format( flags.entropy_beta, flags.grad_norm_clip, flags.rmsp_alpha, flags.save_interval_step) print(return_string)
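# Illustrative sketch of the pause-save-restart coordination that save()
# above performs from thread 0, reduced to bare threading primitives.
# worker_fn and save_fn are stand-ins, not functions from this codebase.
import threading
import time

stop_requested = False

def worker_fn(index):
    # Stands in for a worker's trainer.process(...) loop.
    while not stop_requested:
        time.sleep(0.1)

def save_fn(threads):
    global stop_requested
    stop_requested = True                  # ask workers to pause
    for i, t in enumerate(threads):
        if i != 0:
            t.join()                       # wait for every thread but thread 0
    # saver.save(...) and the wall_t file write happen at this point in the real code
    stop_requested = False
    for i in range(1, len(threads)):       # restart the paused workers
        threads[i] = threading.Thread(target=worker_fn, args=(i,))
        threads[i].start()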
from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf import matplotlib.pyplot as plt from environment.environment import Environment from model.model import UnrealModel from constants import * # use CPU for weight visualize tool device = "/cpu:0" action_size = Environment.get_action_size() global_network = UnrealModel(action_size, -1, device) sess = tf.Session() init = tf.global_variables_initializer() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") vars = {}
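# Hypothetical continuation sketch for the weight-visualize snippet above,
# which stops right after "vars = {}": how first-layer convolution kernels
# are typically pulled out of the restored graph and tiled with matplotlib.
# The "conv"/4-D kernel filter below is an assumption and will differ from
# the variable names actually used by this model.
import numpy as np

conv_kernels = [v for v in tf.trainable_variables()
                if "conv" in v.name.lower() and len(v.shape) == 4]
if conv_kernels:
    W = sess.run(conv_kernels[0])                        # e.g. shape [8, 8, 3, 16]
    n_filters = int(W.shape[-1])
    fig, axes = plt.subplots(1, n_filters, figsize=(2 * n_filters, 2))
    for i, ax in enumerate(np.atleast_1d(axes)):
        f = W[:, :, 0, i]
        f = (f - f.min()) / (f.max() - f.min() + 1e-8)   # normalize for display
        ax.imshow(f, cmap="gray")
        ax.axis("off")
    plt.show()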
class Application(object): def __init__(self): pass def base_train_function(self): """ Train routine for base_trainer. """ trainer = self.base_trainer # set start_time trainer.set_start_time(self.start_time, self.global_t) while True: if self.stop_requested: break if self.terminate_requested: break if self.global_t > flags.max_time_step: break if self.global_t > self.next_save_steps: # Save checkpoint logger.debug("Steps:{}".format(self.global_t)) logger.debug(self.next_save_steps) self.save() diff_global_t = trainer.process(self.sess, self.global_t, self.summary_writer, self.summary_op, self.summary_values, flags.base_lambda) self.global_t += diff_global_t logger.warn("exiting training!") self.environment.stop() #sys.exit(0) time.sleep(1) os._exit(0) def aux_train_function(self, aux_index): """ Train routine for aux_trainer. """ trainer = self.aux_trainers[aux_index] while True: if self.global_t < 500: continue if self.stop_requested: continue if self.terminate_requested: break if self.global_t > flags.max_time_step: break diff_aux_t = trainer.process(self.sess, self.global_t, self.aux_t, self.summary_writer, self.summary_op_aux, self.summary_aux) self.aux_t += diff_aux_t #logger.debug("aux_t:{}".format(self.aux_t)) def run(self): device = "/cpu:0" if USE_GPU: device = "/gpu:0" logger.debug("start App") initial_learning_rate = flags.initial_learning_rate self.global_t = 0 self.aux_t = 0 self.stop_requested = False self.terminate_requested = False logger.debug("getting action size...") visinput = [flags.vision, flags.vis_h, flags.vis_w] action_size = Environment.get_action_size(flags.env_type, flags.env_name) # Setup Global Network logger.debug("loading global model...") self.global_network = UnrealModel( action_size, visinput, -1, flags.entropy_beta, device, flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.use_temporal_coherence, flags.pixel_change_lambda, flags.temporal_coherence_lambda) logger.debug("done loading global model") learning_rate_input = tf.placeholder("float") # Setup gradient calculator #""" grad_applier = RMSPropApplier( learning_rate=learning_rate_input, #decay = flags.rmsp_alpha, momentum=0.0, #epsilon = flags.rmsp_epsilon, clip_norm=flags.grad_norm_clip, device=device) """ grad_applier = AdamApplier(learning_rate = learning_rate_input, clip_norm=flags.grad_norm_clip, device=device) """ # Start environment self.environment = Environment.create_environment( flags.env_type, flags.env_name, visinput) logger.debug("done loading environment") # Setup runner self.runner = RunnerThread(flags, self.environment, self.global_network, action_size, visinput, device, visualise) logger.debug("done setting up RunnerTread") # Setup experience self.experience = Experience(flags.experience_history_size) #@TODO check device usage: should we build a cluster? 
# Setup Base Network self.base_trainer = BaseTrainer( self.runner, self.global_network, initial_learning_rate, learning_rate_input, grad_applier, visinput, flags.env_type, flags.env_name, flags.entropy_beta, flags.gamma, self.experience, flags.max_time_step, device) # Setup Aux Networks self.aux_trainers = [] for k in range(flags.parallel_size): self.aux_trainers.append( AuxTrainer( self.global_network, k + 2, #-1 is global, 0 is runnerthread, 1 is base flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.use_temporal_coherence, flags.pixel_change_lambda, flags.temporal_coherence_lambda, flags.aux_initial_learning_rate, learning_rate_input, grad_applier, visinput, self.aux_t, flags.env_type, flags.env_name, flags.entropy_beta, flags.local_t_max, flags.gamma, flags.aux_lambda, flags.gamma_pc, self.experience, flags.max_time_step, device)) # Start tensorflow session config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True) config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) self.sess.run(tf.global_variables_initializer()) self.init_tensorboard() # init or load checkpoint with saver self.saver = tf.train.Saver(self.global_network.get_vars()) checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir) if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) checkpointpath = checkpoint.model_checkpoint_path.replace( "/", "\\") logger.info("checkpoint loaded: {}".format(checkpointpath)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step self.global_t = int(tokens[1]) logger.info(">>> global step set: {}".format(self.global_t)) logger.info(">>> aux step: {}".format(self.aux_t)) # set wall time wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' 
+ str( self.global_t) with open(wall_t_fname, 'r') as f: self.wall_t = float(f.read()) self.next_save_steps = ( self.global_t + flags.save_interval_step ) // flags.save_interval_step * flags.save_interval_step logger.debug("next save steps:{}".format(self.next_save_steps)) else: logger.info("Could not find old checkpoint") # set wall time self.wall_t = 0.0 self.next_save_steps = flags.save_interval_step signal.signal(signal.SIGINT, self.signal_handler) # set start time self.start_time = time.time() - self.wall_t # Start runner self.runner.start_runner(self.sess) # Start base_network thread self.base_train_thread = threading.Thread( target=self.base_train_function, args=()) self.base_train_thread.start() # Start aux_network threads self.aux_train_threads = [] for k in range(flags.parallel_size): self.aux_train_threads.append( threading.Thread(target=self.aux_train_function, args=(k, ))) self.aux_train_threads[k].start() logger.debug(threading.enumerate()) logger.info('Press Ctrl+C to stop') #signal.pause() def init_tensorboard(self): # tensorboard summary for base self.score_input = tf.placeholder(tf.int32) self.epl_input = tf.placeholder(tf.int32) self.policy_loss = tf.placeholder(tf.float32) self.value_loss = tf.placeholder(tf.float32) self.base_entropy = tf.placeholder(tf.float32) self.base_gradient = tf.placeholder(tf.float32) self.base_lr = tf.placeholder(tf.float32) self.laststate = tf.placeholder( tf.float32, [1, flags.vis_w, flags.vis_h, len(flags.vision)], name="laststate") score = tf.summary.scalar("env/score", self.score_input) epl = tf.summary.scalar("env/ep_length", self.epl_input) policy_loss = tf.summary.scalar("base/policy_loss", self.policy_loss) value_loss = tf.summary.scalar("base/value_loss", self.value_loss) entropy = tf.summary.scalar("base/entropy", self.base_entropy) gradient = tf.summary.scalar("base/gradient", self.base_gradient) lr = tf.summary.scalar("base/learning_rate", self.base_lr) laststate = tf.summary.image("base/laststate", self.laststate) self.summary_values = [ self.score_input, self.epl_input, self.policy_loss, self.value_loss, self.base_entropy, self.base_gradient, self.base_lr, self.laststate ] self.summary_op = tf.summary.merge_all( ) # we want to merge model histograms as well here # tensorboard summary for aux self.summary_aux = [] aux_losses = [] self.aux_basep_loss = tf.placeholder(tf.float32) self.aux_basev_loss = tf.placeholder(tf.float32) self.aux_entropy = tf.placeholder(tf.float32) self.aux_gradient = tf.placeholder(tf.float32) self.summary_aux.append(self.aux_basep_loss) self.summary_aux.append(self.aux_basev_loss) aux_losses.append( tf.summary.scalar("aux/basep_loss", self.aux_basep_loss)) aux_losses.append( tf.summary.scalar("aux/basev_loss", self.aux_basev_loss)) if flags.use_pixel_change: self.pc_loss = tf.placeholder(tf.float32) self.summary_aux.append(self.pc_loss) aux_losses.append(tf.summary.scalar("aux/pc_loss", self.pc_loss)) if flags.use_value_replay: self.vr_loss = tf.placeholder(tf.float32) self.summary_aux.append(self.vr_loss) aux_losses.append(tf.summary.scalar("aux/vr_loss", self.vr_loss)) if flags.use_reward_prediction: self.rp_loss = tf.placeholder(tf.float32) self.summary_aux.append(self.rp_loss) aux_losses.append(tf.summary.scalar("aux/rp_loss", self.rp_loss)) if flags.use_temporal_coherence: self.tc_loss = tf.placeholder(tf.float32) self.summary_aux.append(self.tc_loss) aux_losses.append(tf.summary.scalar("aux/tc_loss", self.tc_loss)) # append entropy and gradient last self.summary_aux.append(self.aux_entropy) 
self.summary_aux.append(self.aux_gradient) aux_losses.append(tf.summary.scalar("aux/entropy", self.aux_entropy)) aux_losses.append(tf.summary.scalar("aux/gradient", self.aux_gradient)) self.summary_op_aux = tf.summary.merge(aux_losses) #self.summary_op = tf.summary.merge_all() tensorboard_path = flags.temp_dir + TRAINING_NAME + "/" logger.info("tensorboard path:" + tensorboard_path) if not os.path.exists(tensorboard_path): os.makedirs(tensorboard_path) self.summary_writer = tf.summary.FileWriter(tensorboard_path) self.summary_writer.add_graph(self.sess.graph) def save(self): """ Save checkpoint. Called from base_trainer. """ self.stop_requested = True # Save if not os.path.exists(flags.checkpoint_dir): os.mkdir(flags.checkpoint_dir) # Write wall time wall_t = time.time() - self.start_time wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str( self.global_t) with open(wall_t_fname, 'w') as f: f.write(str(wall_t)) logger.info('Start saving.') self.saver.save(self.sess, flags.checkpoint_dir + '/' + 'checkpoint', global_step=self.global_t) logger.info('End saving.') self.stop_requested = False self.next_save_steps += flags.save_interval_step def signal_handler(self, signal, frame): logger.warn('Ctrl+C detected, shutting down...') logger.info('run name: {} -- terminated'.format(TRAINING_NAME)) self.terminate_requested = True
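# Minimal sketch of the placeholder-fed summary pattern used by
# init_tensorboard above: scalar values are computed in Python and fed into
# the graph only when a summary string is written (path and tag names here
# are illustrative, not taken from the flags).
import tensorflow as tf

score_ph = tf.placeholder(tf.float32, name="score")
score_summary = tf.summary.scalar("env/score", score_ph)

with tf.Session() as sess:
    writer = tf.summary.FileWriter("/tmp/tb_demo", sess.graph)
    for step, score in enumerate([1.0, 2.5, 3.0]):
        summary_str = sess.run(score_summary, feed_dict={score_ph: score})
        writer.add_summary(summary_str, step)
    writer.flush()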
def run(self): device = "/cpu:0" if USE_GPU: device = "/gpu:0" logger.debug("start App") initial_learning_rate = flags.initial_learning_rate self.global_t = 0 self.aux_t = 0 self.stop_requested = False self.terminate_requested = False logger.debug("getting action size...") visinput = [flags.vision, flags.vis_h, flags.vis_w] action_size = Environment.get_action_size(flags.env_type, flags.env_name) # Setup Global Network logger.debug("loading global model...") self.global_network = UnrealModel( action_size, visinput, -1, flags.entropy_beta, device, flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.use_temporal_coherence, flags.pixel_change_lambda, flags.temporal_coherence_lambda) logger.debug("done loading global model") learning_rate_input = tf.placeholder("float") # Setup gradient calculator #""" grad_applier = RMSPropApplier( learning_rate=learning_rate_input, #decay = flags.rmsp_alpha, momentum=0.0, #epsilon = flags.rmsp_epsilon, clip_norm=flags.grad_norm_clip, device=device) """ grad_applier = AdamApplier(learning_rate = learning_rate_input, clip_norm=flags.grad_norm_clip, device=device) """ # Start environment self.environment = Environment.create_environment( flags.env_type, flags.env_name, visinput) logger.debug("done loading environment") # Setup runner self.runner = RunnerThread(flags, self.environment, self.global_network, action_size, visinput, device, visualise) logger.debug("done setting up RunnerTread") # Setup experience self.experience = Experience(flags.experience_history_size) #@TODO check device usage: should we build a cluster? # Setup Base Network self.base_trainer = BaseTrainer( self.runner, self.global_network, initial_learning_rate, learning_rate_input, grad_applier, visinput, flags.env_type, flags.env_name, flags.entropy_beta, flags.gamma, self.experience, flags.max_time_step, device) # Setup Aux Networks self.aux_trainers = [] for k in range(flags.parallel_size): self.aux_trainers.append( AuxTrainer( self.global_network, k + 2, #-1 is global, 0 is runnerthread, 1 is base flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, flags.use_temporal_coherence, flags.pixel_change_lambda, flags.temporal_coherence_lambda, flags.aux_initial_learning_rate, learning_rate_input, grad_applier, visinput, self.aux_t, flags.env_type, flags.env_name, flags.entropy_beta, flags.local_t_max, flags.gamma, flags.aux_lambda, flags.gamma_pc, self.experience, flags.max_time_step, device)) # Start tensorflow session config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True) config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) self.sess.run(tf.global_variables_initializer()) self.init_tensorboard() # init or load checkpoint with saver self.saver = tf.train.Saver(self.global_network.get_vars()) checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir) if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) checkpointpath = checkpoint.model_checkpoint_path.replace( "/", "\\") logger.info("checkpoint loaded: {}".format(checkpointpath)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step self.global_t = int(tokens[1]) logger.info(">>> global step set: {}".format(self.global_t)) logger.info(">>> aux step: {}".format(self.aux_t)) # set wall time wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' 
+ str( self.global_t) with open(wall_t_fname, 'r') as f: self.wall_t = float(f.read()) self.next_save_steps = ( self.global_t + flags.save_interval_step ) // flags.save_interval_step * flags.save_interval_step logger.debug("next save steps:{}".format(self.next_save_steps)) else: logger.info("Could not find old checkpoint") # set wall time self.wall_t = 0.0 self.next_save_steps = flags.save_interval_step signal.signal(signal.SIGINT, self.signal_handler) # set start time self.start_time = time.time() - self.wall_t # Start runner self.runner.start_runner(self.sess) # Start base_network thread self.base_train_thread = threading.Thread( target=self.base_train_function, args=()) self.base_train_thread.start() # Start aux_network threads self.aux_train_threads = [] for k in range(flags.parallel_size): self.aux_train_threads.append( threading.Thread(target=self.aux_train_function, args=(k, ))) self.aux_train_threads[k].start() logger.debug(threading.enumerate()) logger.info('Press Ctrl+C to stop')
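# Small worked example of how the restore branch above recovers the global
# step from a checkpoint path of the form <dir>/checkpoint-<step>
# (the path below is made up):
path = "/tmp/checkpoints/checkpoint-2000000"
tokens = path.split("-")
global_t = int(tokens[1])                                    # -> 2000000
wall_t_fname = "/tmp/checkpoints" + "/" + "wall_t." + str(global_t)
print(global_t, wall_t_fname)    # 2000000 /tmp/checkpoints/wall_t.2000000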
class Display(object): def __init__(self, display_size): pygame.init() self.surface = pygame.display.set_mode(display_size, 0, 24) pygame.display.set_caption('UNREAL') self.action_size = Environment.get_action_size(flags.env_type, flags.env_name) self.objective_size = Environment.get_objective_size( flags.env_type, flags.env_name) self.global_network = UnrealModel(self.action_size, self.objective_size, -1, flags.use_lstm, flags.use_pixel_change, flags.use_value_replay, flags.use_reward_prediction, 0.0, 0.0, "/cpu:0", for_display=True) self.environment = Environment.create_environment( flags.env_type, flags.env_name, env_args={ 'episode_schedule': flags.split, 'log_action_trace': flags.log_action_trace, 'max_states_per_scene': flags.episodes_per_scene, 'episodes_per_scene_test': flags.episodes_per_scene }) self.font = pygame.font.SysFont(None, 20) self.value_history = ValueHistory() self.state_history = StateHistory() self.episode_reward = 0 def update(self, sess): self.surface.fill(BLACK) self.process(sess) pygame.display.update() def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def scale_image(self, image, scale): return image.repeat(scale, axis=0).repeat(scale, axis=1) def draw_text(self, str, left, top, color=WHITE): text = self.font.render(str, True, color, BLACK) text_rect = text.get_rect() text_rect.left = left text_rect.top = top self.surface.blit(text, text_rect) def draw_center_text(self, str, center_x, top): text = self.font.render(str, True, WHITE, BLACK) text_rect = text.get_rect() text_rect.centerx = center_x text_rect.top = top self.surface.blit(text, text_rect) def show_pixel_change(self, pixel_change, left, top, rate, label): """ Show pixel change """ pixel_change_ = np.clip(pixel_change * 255.0 * rate, 0.0, 255.0) data = pixel_change_.astype(np.uint8) data = np.stack([data for _ in range(3)], axis=2) data = self.scale_image(data, 4) image = pygame.image.frombuffer(data, (20 * 4, 20 * 4), 'RGB') self.surface.blit(image, (left + 8 + 4, top + 8 + 4)) self.draw_center_text(label, left + 100 / 2, top + 100) def show_policy(self, pi): """ Show action probability. 
""" start_x = 10 y = 150 for i in range(len(pi)): width = pi[i] * 100 pygame.draw.rect(self.surface, WHITE, (start_x, y, width, 10)) y += 20 self.draw_center_text("PI", 50, y) def show_image(self, state): """ Show input image """ state_ = state * 255.0 data = state_.astype(np.uint8) image = pygame.image.frombuffer(data, (84, 84), 'RGB') self.surface.blit(image, (8, 8)) self.draw_center_text("input", 50, 100) def show_value(self): if self.value_history.is_empty: return min_v = float("inf") max_v = float("-inf") values = self.value_history.values for v in values: min_v = min(min_v, v) max_v = max(max_v, v) top = 150 left = 150 width = 100 height = 100 bottom = top + width right = left + height d = max_v - min_v last_r = 0.0 for i, v in enumerate(values): r = (v - min_v) / d if i > 0: x0 = i - 1 + left x1 = i + left y0 = bottom - last_r * height y1 = bottom - r * height pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1) last_r = r pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1) pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1) pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1) pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom), 1) self.draw_center_text("V", left + width / 2, bottom + 10) def show_reward_prediction(self, rp_c, reward): start_x = 310 reward_index = 0 if reward == 0: reward_index = 0 elif reward > 0: reward_index = 1 elif reward < 0: reward_index = 2 y = 150 labels = ["0", "+", "-"] for i in range(len(rp_c)): width = rp_c[i] * 100 if i == reward_index: color = RED else: color = WHITE pygame.draw.rect(self.surface, color, (start_x + 15, y, width, 10)) self.draw_text(labels[i], start_x, y - 1, color) y += 20 self.draw_center_text("RP", start_x + 100 / 2, y) def show_reward(self): self.draw_text("REWARD: {}".format(int(self.episode_reward)), 310, 10) def process(self, sess): last_action = self.environment.last_action last_reward = np.clip(self.environment.last_reward, -1, 1) last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward, self.environment.last_state) if not flags.use_pixel_change: pi_values, v_value = self.global_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) else: pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q( sess, self.environment.last_state, last_action_reward) self.value_history.add_value(v_value) action = self.choose_action(pi_values) state, reward, terminal, pixel_change = self.environment.process( action) self.episode_reward += reward if terminal: self.environment.reset() self.episode_reward = 0 self.show_image(state['image']) self.show_policy(pi_values) self.show_value() self.show_reward() if flags.use_pixel_change: self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC") self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q") if flags.use_reward_prediction: if self.state_history.is_full: rp_c = self.global_network.run_rp_c(sess, self.state_history.states) self.show_reward_prediction(rp_c, reward) self.state_history.add_state(state) def get_frame(self): data = self.surface.get_buffer().raw return data
class Trainer(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, env_type, env_name, use_pixel_change, use_value_replay, use_reward_prediction, use_future_reward_prediction, use_autoencoder, reward_length, pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc, experience_history_size, max_global_time_step, device, log_file, skip_step): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.env_type = env_type self.env_name = env_name self.use_pixel_change = use_pixel_change self.use_value_replay = use_value_replay self.use_reward_prediction = use_reward_prediction self.use_future_reward_prediction = use_future_reward_prediction self.use_autoencoder = use_autoencoder self.local_t_max = local_t_max self.gamma = gamma self.gamma_pc = gamma_pc self.experience_history_size = experience_history_size self.max_global_time_step = max_global_time_step self.skip_step = skip_step self.action_size = Environment.get_action_size(env_type, env_name) self.local_network = UnrealModel(self.action_size, thread_index, use_pixel_change, use_value_replay, use_reward_prediction, use_future_reward_prediction, use_autoencoder, pixel_change_lambda, entropy_beta, device) self.local_network.prepare_loss() self.apply_gradients = grad_applier.minimize_local( self.local_network.total_loss, global_network.get_vars(), self.local_network.get_vars()) self.sync = self.local_network.sync_from(global_network) self.experience = Experience(self.experience_history_size, reward_length) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # For log output self.prev_local_t = 0 self.log_file = log_file self.prediction_res_file = log_file + '/' + 'res.pkl' def prepare(self): self.environment = Environment.create_environment( self.env_type, self.env_name, self.skip_step) def stop(self): self.environment.stop() def add_summary(self, step, name, value, writer): summary = tf.Summary() summary_value = summary.value.add() summary_value.simple_value = float(value) summary_value.tag = name writer.add_summary(summary, step) def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def set_start_time(self, start_time): self.start_time = start_time def _fill_experience(self, sess): """ Fill experience buffer until buffer is full. 
""" prev_state = self.environment.last_state last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) pi_, _ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) new_state, reward, terminal, pixel_change = self.environment.process( action) frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change, last_action, last_reward) self.experience.add_frame(frame) if terminal: self.environment.reset() if self.experience.is_full(): self.environment.reset() print("Replay buffer filled") def _print_log(self, global_t): if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) def _process_base(self, sess, global_t, summary_writer, summary_op, score_input): # [Base A3C] states = [] last_action_rewards = [] actions = [] rewards = [] values = [] terminal_end = False start_lstm_state = self.local_network.base_lstm_state_out # t_max times loop for _ in range(self.local_t_max): # Prepare last action reward last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) pi_, value_ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) states.append(self.environment.last_state) last_action_rewards.append(last_action_reward) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("pi={}".format(pi_)) print(" V={}".format(value_)) prev_state = self.environment.last_state # Process game new_state, reward, terminal, pixel_change = self.environment.process( action) frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change, last_action, last_reward) # Store to experience self.experience.add_frame(frame) self.episode_reward += reward rewards.append(reward) self.local_t += 1 if terminal: terminal_end = True print("score={}".format(self.episode_reward)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.environment.reset() self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_base_value( sess, new_state, frame.get_action_reward(self.action_size)) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_adv = [] batch_R = [] for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + self.gamma * R adv = R - Vi a = np.zeros([self.action_size]) a[ai] = 1.0 batch_si.append(si) batch_a.append(a) batch_adv.append(adv) batch_R.append(R) batch_si.reverse() batch_a.reverse() batch_adv.reverse() batch_R.reverse() return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state def _process_pc(self, sess): # [pixel change] # Sample 20+1 frame (+1 for last next state) pc_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last 
pc_experience_frames.reverse() batch_pc_si = [] batch_pc_a = [] batch_pc_R = [] batch_pc_last_action_reward = [] pc_R = np.zeros([20, 20], dtype=np.float32) if not pc_experience_frames[1].terminal: pc_R = self.local_network.run_pc_q_max( sess, pc_experience_frames[0].state, pc_experience_frames[0].get_last_action_reward( self.action_size)) for frame in pc_experience_frames[1:]: pc_R = frame.pixel_change + self.gamma_pc * pc_R a = np.zeros([self.action_size]) a[frame.action] = 1.0 last_action_reward = frame.get_last_action_reward(self.action_size) batch_pc_si.append(frame.state) batch_pc_a.append(a) batch_pc_R.append(pc_R) batch_pc_last_action_reward.append(last_action_reward) batch_pc_si.reverse() batch_pc_a.reverse() batch_pc_R.reverse() batch_pc_last_action_reward.reverse() return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R def _process_vr(self, sess): # [Value replay] # Sample 20+1 frame (+1 for last next state) vr_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last vr_experience_frames.reverse() batch_vr_si = [] batch_vr_R = [] batch_vr_last_action_reward = [] vr_R = 0.0 if not vr_experience_frames[1].terminal: vr_R = self.local_network.run_vr_value( sess, vr_experience_frames[0].state, vr_experience_frames[0].get_last_action_reward( self.action_size)) # t_max times loop for frame in vr_experience_frames[1:]: vr_R = frame.reward + self.gamma * vr_R batch_vr_si.append(frame.state) batch_vr_R.append(vr_R) last_action_reward = frame.get_last_action_reward(self.action_size) batch_vr_last_action_reward.append(last_action_reward) batch_vr_si.reverse() batch_vr_R.reverse() batch_vr_last_action_reward.reverse() return batch_vr_si, batch_vr_last_action_reward, batch_vr_R ''' def _process_rp(self): # [Reward prediction] rp_experience_frames, total_raw_reward, _ = self.experience.sample_rp_sequence() # 4 frames batch_rp_si = [] batch_rp_c = [] for i in range(4): batch_rp_si.append(rp_experience_frames[i].state) # one hot vector for target reward r = total_raw_reward rp_c = [0.0, 0.0, 0.0] if r == 0: rp_c[0] = 1.0 # zero elif r > 0: rp_c[1] = 1.0 # positive else: rp_c[2] = 1.0 # negative batch_rp_c.append(rp_c) return batch_rp_si, batch_rp_c ''' def _process_replay(self, action=False): # [Reward prediction] rp_experience_frames, total_raw_reward, next_frame = self.experience.sample_rp_sequence( flag=True) # 4 frames batch_rp_si = [] batch_rp_c = [] for i in range(4): batch_rp_si.append(rp_experience_frames[i].state) # one hot vector for target reward r = total_raw_reward rp_c = [0.0, 0.0, 0.0] if r == 0: rp_c[0] = 1.0 # zero elif r > 0: rp_c[1] = 1.0 # positive else: rp_c[2] = 1.0 # negative batch_rp_c.append(rp_c) result = [batch_rp_si, batch_rp_c, next_frame] if action: batch_rp_action = [] action_index = rp_experience_frames[3].action action_one_hot = np.zeros([self.action_size]) action_one_hot[action_index] = 1.0 batch_rp_action.append(action_one_hot) result.append(batch_rp_action) return result def process(self, sess, global_t, summary_writer, summary_op, score_input): # Fill experience replay buffer if not self.experience.is_full(): self._fill_experience(sess) return 0 start_local_t = self.local_t cur_learning_rate = self._anneal_learning_rate(global_t) # Copy weights from shared to local sess.run(self.sync) # [Base] batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \ self._process_base(sess, global_t, summary_writer, summary_op, score_input) feed_dict = { 
self.local_network.base_input: batch_si, self.local_network.base_last_action_reward_input: batch_last_action_rewards, self.local_network.base_a: batch_a, self.local_network.base_adv: batch_adv, self.local_network.base_r: batch_R, self.local_network.base_initial_lstm_state: start_lstm_state, # [common] self.learning_rate_input: cur_learning_rate } # [Pixel change] if self.use_pixel_change: batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc( sess) pc_feed_dict = { self.local_network.pc_input: batch_pc_si, self.local_network.pc_last_action_reward_input: batch_pc_last_action_reward, self.local_network.pc_a: batch_pc_a, self.local_network.pc_r: batch_pc_R } feed_dict.update(pc_feed_dict) # [Value replay] if self.use_value_replay: batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr( sess) vr_feed_dict = { self.local_network.vr_input: batch_vr_si, self.local_network.vr_last_action_reward_input: batch_vr_last_action_reward, self.local_network.vr_r: batch_vr_R } feed_dict.update(vr_feed_dict) # [Reward prediction] next_frame = None if self.use_reward_prediction: batch_rp_si, batch_rp_c, next_frame = self._process_replay() rp_feed_dict = { self.local_network.rp_input: batch_rp_si, self.local_network.rp_c_target: batch_rp_c } feed_dict.update(rp_feed_dict) # [Future reward prediction] if self.use_future_reward_prediction: batch_frp_si, batch_frp_c, next_frame, batch_frp_action = self._process_replay( action=True) frp_feed_dict = { self.local_network.frp_input: batch_frp_si, self.local_network.frp_c_target: batch_frp_c, self.local_network.frp_action_input: batch_frp_action } feed_dict.update(frp_feed_dict) if next_frame and self.use_autoencoder: ae_feed_dict = { self.local_network.ground_truth: np.expand_dims(next_frame.state, axis=0) } feed_dict.update(ae_feed_dict) # Calculate gradients and copy them to global network. 
#sess.run( self.apply_gradients, feed_dict=feed_dict) ln = self.local_network if self.use_future_reward_prediction: if self.use_autoencoder: frp_c, decoder_loss, frp_loss, value_loss, policy_loss, _ = sess.run( [ ln.frp_c, ln.decoder_loss, ln.frp_loss, ln.value_loss, ln.policy_loss, self.apply_gradients ], feed_dict=feed_dict) self.add_summary(global_t, 'decoder_loss', decoder_loss, summary_writer) self.add_summary(global_t, 'frp_loss', frp_loss, summary_writer) else: frp_c, value_loss, policy_loss, _ = sess.run( [ ln.frp_c, ln.value_loss, ln.policy_loss, self.apply_gradients ], feed_dict=feed_dict) acc = ((frp_c == frp_c.max()) * batch_frp_c).sum() self.add_summary(global_t, 'reward prediction accuracy', acc, summary_writer) else: value_loss, policy_loss, _ = sess.run( [ln.value_loss, ln.policy_loss, self.apply_gradients], feed_dict=feed_dict) self.add_summary(global_t, 'value_loss', value_loss, summary_writer) self.add_summary(global_t, 'policy_loss', policy_loss, summary_writer) self.add_summary(global_t, 'base_loss', policy_loss + value_loss, summary_writer) if self.use_autoencoder and global_t % 25000 == 0: current_res = { 'next_frame_ground_truth': next_frame, 'step': global_t } if self.use_reward_prediction: predicted_frame, predicted_reward = sess.run( [ self.local_network.encoder_output, self.local_network.rp_c ], feed_dict=feed_dict) current_res['states'] = batch_rp_si current_res['target_reward'] = batch_rp_c elif self.use_future_reward_prediction: predicted_frame, predicted_reward = sess.run( [ self.local_network.encoder_output, self.local_network.frp_c ], feed_dict=feed_dict) current_res['states'] = batch_frp_si current_res['target_reward'] = batch_frp_c current_res['action'] = batch_frp_action current_res['next_frame_prediction'] = predicted_frame current_res['next_reward_prediction'] = predicted_reward if os.path.exists(self.prediction_res_file) and os.path.getsize( self.prediction_res_file) > 0: with open(self.prediction_res_file, 'rb') as f: res = pickle.load(f) else: res = [] res.append(current_res) with open(self.prediction_res_file, 'wb') as f: pickle.dump(res, f) self._print_log(global_t) # Return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
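# Small sketch of the 3-way reward-prediction target built by
# _process_replay above: the raw reward that follows a short window of
# frames is mapped to a one-hot class (zero / positive / negative).
def rp_target(reward):
    rp_c = [0.0, 0.0, 0.0]
    if reward == 0:
        rp_c[0] = 1.0
    elif reward > 0:
        rp_c[1] = 1.0
    else:
        rp_c[2] = 1.0
    return rp_c

print(rp_target(0.0), rp_target(1.0), rp_target(-0.5))
# [1.0, 0.0, 0.0] [0.0, 1.0, 0.0] [0.0, 0.0, 1.0]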
class Trainer(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, env_type, env_name, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc, experience_history_size, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.env_type = env_type self.env_name = env_name self.use_pixel_change = use_pixel_change self.use_value_replay = use_value_replay self.use_reward_prediction = use_reward_prediction self.local_t_max = local_t_max self.gamma = gamma self.gamma_pc = gamma_pc self.experience_history_size = experience_history_size self.max_global_time_step = max_global_time_step self.action_size = Environment.get_action_size(env_type, env_name) self.local_network = UnrealModel(self.action_size, thread_index, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, device) self.local_network.prepare_loss() self.apply_gradients = grad_applier.minimize_local( self.local_network.total_loss, global_network.get_vars(), self.local_network.get_vars()) self.sync = self.local_network.sync_from(global_network) self.experience = Experience(self.experience_history_size) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # For log output self.prev_local_t = 0 def prepare(self): self.environment = Environment.create_environment( self.env_type, self.env_name) def stop(self): self.environment.stop() def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def set_start_time(self, start_time): self.start_time = start_time def _fill_experience(self, sess): """ Fill experience buffer until buffer is full. """ prev_state = self.environment.last_state last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) pi_, _ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) new_state, reward, terminal, pixel_change = self.environment.process( action) frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change, last_action, last_reward) self.experience.add_frame(frame) if terminal: self.environment.reset() if self.experience.is_full(): self.environment.reset() print("Replay buffer filled") def _print_log(self, global_t): if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. 
{:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) def _process_base(self, sess, global_t, summary_writer, summary_op, score_input): # [Base A3C] states = [] last_action_rewards = [] actions = [] rewards = [] values = [] terminal_end = False start_lstm_state = self.local_network.base_lstm_state_out # t_max times loop for _ in range(self.local_t_max): # Prepare last action reward last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) #Modify Last State - with attention pi_, value_ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) states.append(self.environment.last_state) last_action_rewards.append(last_action_reward) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("pi={}".format(pi_)) print(" V={}".format(value_)) prev_state = self.environment.last_state # Process game new_state, reward, terminal, pixel_change = self.environment.process( action) #Modify New State - with attention frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change, last_action, last_reward) # Store to experience self.experience.add_frame(frame) self.episode_reward += reward rewards.append(reward) self.local_t += 1 if terminal: terminal_end = True print("score={}".format(self.episode_reward)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.environment.reset() self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_base_value( sess, new_state, frame.get_action_reward(self.action_size)) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_adv = [] batch_R = [] for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + self.gamma * R adv = R - Vi a = np.zeros([self.action_size]) a[ai] = 1.0 batch_si.append(si) batch_a.append(a) batch_adv.append(adv) batch_R.append(R) batch_si.reverse() batch_a.reverse() batch_adv.reverse() batch_R.reverse() return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state def _process_pc(self, sess): # [pixel change] # Sample 20+1 frame (+1 for last next state) pc_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last pc_experience_frames.reverse() batch_pc_si = [] batch_pc_a = [] batch_pc_R = [] batch_pc_last_action_reward = [] pc_R = np.zeros([20, 20], dtype=np.float32) if not pc_experience_frames[1].terminal: pc_R = self.local_network.run_pc_q_max( sess, pc_experience_frames[0].state, pc_experience_frames[0].get_last_action_reward( self.action_size)) for frame in pc_experience_frames[1:]: pc_R = frame.pixel_change + self.gamma_pc * pc_R a = np.zeros([self.action_size]) a[frame.action] = 1.0 last_action_reward = frame.get_last_action_reward(self.action_size) batch_pc_si.append(frame.state) batch_pc_a.append(a) batch_pc_R.append(pc_R) batch_pc_last_action_reward.append(last_action_reward) batch_pc_si.reverse() batch_pc_a.reverse() batch_pc_R.reverse() batch_pc_last_action_reward.reverse() return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R def _process_vr(self, sess): # [Value replay] # Sample 20+1 frame (+1 for last next state) vr_experience_frames = 
  def _process_vr(self, sess):
    # [Value replay]
    # Sample 20+1 frames (+1 for the last next state)
    vr_experience_frames = self.experience.sample_sequence(self.local_t_max + 1)
    # Reverse sequence to calculate from the last
    vr_experience_frames.reverse()

    batch_vr_si = []
    batch_vr_R = []
    batch_vr_last_action_reward = []

    vr_R = 0.0
    if not vr_experience_frames[1].terminal:
      vr_R = self.local_network.run_vr_value(
          sess, vr_experience_frames[0].state,
          vr_experience_frames[0].get_last_action_reward(self.action_size))

    # t_max times loop
    for frame in vr_experience_frames[1:]:
      vr_R = frame.reward + self.gamma * vr_R
      batch_vr_si.append(frame.state)
      batch_vr_R.append(vr_R)
      last_action_reward = frame.get_last_action_reward(self.action_size)
      batch_vr_last_action_reward.append(last_action_reward)

    batch_vr_si.reverse()
    batch_vr_R.reverse()
    batch_vr_last_action_reward.reverse()

    return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

  def _process_rp(self):
    # [Reward prediction]
    rp_experience_frames = self.experience.sample_rp_sequence()
    # 4 frames

    batch_rp_si = []
    batch_rp_c = []

    for i in range(3):
      batch_rp_si.append(rp_experience_frames[i].state)

    # one hot vector for target reward
    r = rp_experience_frames[3].reward
    rp_c = [0.0, 0.0, 0.0]
    if r == 0:
      rp_c[0] = 1.0  # zero
    elif r > 0:
      rp_c[1] = 1.0  # positive
    else:
      rp_c[2] = 1.0  # negative
    batch_rp_c.append(rp_c)
    return batch_rp_si, batch_rp_c

  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    # Fill experience replay buffer
    if not self.experience.is_full():
      self._fill_experience(sess)
      return 0

    start_local_t = self.local_t

    cur_learning_rate = self._anneal_learning_rate(global_t)

    # Copy weights from shared to local
    sess.run(self.sync)

    # [Base]
    batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
        self._process_base(sess, global_t, summary_writer, summary_op, score_input)

    feed_dict = {
        self.local_network.base_input: batch_si,
        self.local_network.base_last_action_reward_input: batch_last_action_rewards,
        self.local_network.base_a: batch_a,
        self.local_network.base_adv: batch_adv,
        self.local_network.base_r: batch_R,
        self.local_network.base_initial_lstm_state: start_lstm_state,
        # [common]
        self.learning_rate_input: cur_learning_rate
    }

    # [Pixel change]
    if self.use_pixel_change:
      batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = \
          self._process_pc(sess)

      pc_feed_dict = {
          self.local_network.pc_input: batch_pc_si,
          self.local_network.pc_last_action_reward_input: batch_pc_last_action_reward,
          self.local_network.pc_a: batch_pc_a,
          self.local_network.pc_r: batch_pc_R
      }
      feed_dict.update(pc_feed_dict)

    # [Value replay]
    if self.use_value_replay:
      batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(sess)

      vr_feed_dict = {
          self.local_network.vr_input: batch_vr_si,
          self.local_network.vr_last_action_reward_input: batch_vr_last_action_reward,
          self.local_network.vr_r: batch_vr_R
      }
      feed_dict.update(vr_feed_dict)

    # [Reward prediction]
    if self.use_reward_prediction:
      batch_rp_si, batch_rp_c = self._process_rp()

      rp_feed_dict = {
          self.local_network.rp_input: batch_rp_si,
          self.local_network.rp_c_target: batch_rp_c
      }
      feed_dict.update(rp_feed_dict)

    # Calculate gradients and copy them to the global network.
    sess.run(self.apply_gradients, feed_dict=feed_dict)

    self._print_log(global_t)

    # Return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
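# For reference, the reverse accumulation in Trainer._process_base computes
# n-step bootstrapped returns R_t = r_t + gamma * R_{t+1}, seeded with the
# bootstrap value V(s_T) (or 0.0 after a terminal step), and advantages R_t - V(s_t).
# A minimal standalone sketch of that arithmetic; the helper name and the
# numbers below are illustrative only, not part of the trainer.
def _n_step_returns_and_advantages(rewards, values, bootstrap_value, gamma):
  # rewards, values: ordered oldest-to-newest for one rollout segment.
  # bootstrap_value: value estimate for the state after the last step.
  R = bootstrap_value
  returns, advantages = [], []
  for r, v in zip(reversed(rewards), reversed(values)):
    R = r + gamma * R
    returns.append(R)
    advantages.append(R - v)
  returns.reverse()
  advantages.reverse()
  return returns, advantages

# Example: two steps, gamma = 0.9, bootstrap V = 1.0
#   R_1 = 1.0 + 0.9 * 1.0 = 1.9,  R_0 = 0.0 + 0.9 * 1.9 = 1.71
_rets, _advs = _n_step_returns_and_advantages([0.0, 1.0], [0.5, 0.8], 1.0, 0.9)
assert abs(_rets[1] - 1.9) < 1e-6 and abs(_rets[0] - 1.71) < 1e-6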
class Application(object):
  def __init__(self):
    pass

  def train_function(self, parallel_index, preparing):
    """
    Train each environment.
    """
    trainer = self.trainers[parallel_index]
    if preparing:
      trainer.prepare()

    # set start_time
    trainer.set_start_time(self.start_time)

    while True:
      if self.stop_requested:
        break
      if self.terminate_requested:
        trainer.stop()
        break
      if self.global_t > flags.max_time_step:
        trainer.stop()
        break
      if parallel_index == 0 and self.global_t > self.next_save_steps:
        # Save checkpoint
        self.save()

      # Each env calls its own process(); the sub tasks are invoked inside it.
      diff_global_t = trainer.process(self.sess, self.global_t,
                                      self.summary_writer, self.summary_op,
                                      self.score_input)
      self.global_t += diff_global_t

  def run(self):
    device = "/cpu:0"
    if USE_GPU:
      device = "/gpu:0"

    initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                        flags.initial_alpha_high,
                                        flags.initial_alpha_log_rate)

    self.global_t = 0

    self.stop_requested = False
    self.terminate_requested = False

    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.global_network = UnrealModel(action_size, -1,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      flags.pixel_change_lambda,
                                      flags.entropy_beta, device)
    self.trainers = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=flags.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=flags.rmsp_epsilon,
                                  clip_norm=flags.grad_norm_clip,
                                  device=device)

    # Each Trainer creates its own UnrealModel in __init__.
    for i in range(flags.parallel_size):
      trainer = Trainer(i, self.global_network, initial_learning_rate,
                        learning_rate_input, grad_applier, flags.env_type,
                        flags.env_name, flags.use_pixel_change,
                        flags.use_value_replay, flags.use_reward_prediction,
                        flags.pixel_change_lambda, flags.entropy_beta,
                        flags.local_t_max, flags.gamma, flags.gamma_pc,
                        flags.experience_history_size, flags.max_time_step,
                        device)
      self.trainers.append(trainer)

    # prepare session
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())

    # summary for tensorboard
    self.score_input = tf.placeholder(tf.int32)
    tf.summary.scalar("score", self.score_input)

    self.summary_op = tf.summary.merge_all()
    self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                self.sess.graph)

    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_vars())

    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
      self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
      print("checkpoint loaded:", checkpoint.model_checkpoint_path)
      tokens = checkpoint.model_checkpoint_path.split("-")
      # set global step
      self.global_t = int(tokens[1])
      print(">>> global step set: ", self.global_t)
      # set wall time
      wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
      with open(wall_t_fname, 'r') as f:
        self.wall_t = float(f.read())
        self.next_save_steps = (
            self.global_t + flags.save_interval_step
        ) // flags.save_interval_step * flags.save_interval_step
    else:
      print("Could not find old checkpoint")
      # set wall time
      self.wall_t = 0.0
      self.next_save_steps = flags.save_interval_step

    # run training threads
    ## Each env runs here in parallel, one thread per Trainer.
    self.train_threads = []
    for i in range(flags.parallel_size):
      self.train_threads.append(
          threading.Thread(target=self.train_function, args=(i, True)))

    signal.signal(signal.SIGINT, self.signal_handler)

    # set start time
    self.start_time = time.time() - self.wall_t

    for t in self.train_threads:
      t.start()

    print('Press Ctrl+C to stop')
    signal.pause()

  def save(self):
    """
    Save checkpoint. Called from thread-0.
    """
    self.stop_requested = True

    # Wait for all other threads to stop
    for (i, t) in enumerate(self.train_threads):
      if i != 0:
        t.join()

    # Save
    if not os.path.exists(flags.checkpoint_dir):
      os.mkdir(flags.checkpoint_dir)

    # Write wall time
    wall_t = time.time() - self.start_time
    wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
    with open(wall_t_fname, 'w') as f:
      f.write(str(wall_t))

    print('Start saving.')
    self.saver.save(self.sess,
                    flags.checkpoint_dir + '/' + 'checkpoint',
                    global_step=self.global_t)
    print('End saving.')

    self.stop_requested = False
    self.next_save_steps += flags.save_interval_step

    # Restart other threads
    for i in range(flags.parallel_size):
      if i != 0:
        thread = threading.Thread(target=self.train_function, args=(i, False))
        self.train_threads[i] = thread
        thread.start()

  def signal_handler(self, signal, frame):
    print('You pressed Ctrl+C!')
    self.terminate_requested = True
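# Quick check of the resume arithmetic in Application.run above: the expression
# (global_t + save_interval_step) // save_interval_step * save_interval_step
# rounds the restored global_t up to the next multiple of the save interval
# (and jumps a full interval when global_t is already an exact multiple).
# The helper name and numbers below are illustrative only.
def _next_save_step(global_t, save_interval_step):
  return (global_t + save_interval_step) // save_interval_step * save_interval_step

assert _next_save_step(0, 1000) == 1000      # fresh start
assert _next_save_step(2300, 1000) == 3000   # resume mid-interval
assert _next_save_step(3000, 1000) == 4000   # exact multiple skips a full interval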
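# log_uniform (used in Application.run to draw the initial learning rate) is not
# defined in this listing. A typical definition, as in the reference UNREAL code,
# interpolates between the two bounds in log space; treat the sketch below as an
# assumption about that helper rather than its actual implementation here.
import math

def _log_uniform_sketch(lo, hi, rate):
  # Interpolate in log space, then exponentiate back.
  v = math.log(lo) * (1.0 - rate) + math.log(hi) * rate
  return math.exp(v)

# rate = 0 gives lo, rate = 1 gives hi, rate = 0.5 gives the geometric mean.
assert abs(_log_uniform_sketch(1e-4, 5e-3, 0.0) - 1e-4) < 1e-10
assert abs(_log_uniform_sketch(1e-4, 5e-3, 1.0) - 5e-3) < 1e-10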
class Evaluate(object):
  def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type,
                                                   flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type,
                                                         flags.env_name)

    env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config['height'], env_config['width']]
    segnet_param_dict = {'segnet_mode': flags.segnet}
    is_training = tf.placeholder(tf.bool, name="training")

    # for_display param of UnrealModel is set to True here
    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0,  # flags.pixel_change_lambda
                                      0.0,  # flags.entropy_beta
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout,
                                      for_display=True)
    self.environment = Environment.create_environment(
        flags.env_type,
        flags.env_name,
        flags.termination_time_sec,
        env_args={
            'episode_schedule': flags.split,
            'log_action_trace': flags.log_action_trace,
            'max_states_per_scene': flags.episodes_per_scene,
            'episodes_per_scene_test': flags.episodes_per_scene
        })
    self.global_network.prepare_loss()

    self.total_loss = []
    self.segm_loss = []
    self.episode_reward = [0]
    self.episode_roomtype = []
    self.roomType_dict = {}
    self.segnet_class_dict = {}
    self.success_rate = []

    self.batch_size = 20
    self.batch_cur_num = 0
    self.batch_prev_num = 0
    self.batch_si = []
    self.batch_sobjT = []
    self.batch_a = []
    self.batch_reward = []

  def update(self, sess):
    self.process(sess)

  def is_done(self):
    return self.environment.is_all_scheduled_episodes_done()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def process(self, sess):
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward,
        self.environment.last_state)

    if random_policy:
      pi_values = [1 / 3.0, 1 / 3.0, 1 / 3.0]
      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      self.episode_reward[-1] += reward
    else:
      mode = "segnet" if flags.segnet >= 2 else ""
      segnet_preds = None
      if not flags.use_pixel_change:
        pi_values, v_value, segnet_preds = \
            self.global_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward,
                mode=mode)
      else:
        pi_values, v_value, pc_q = \
            self.global_network.run_base_policy_value_pc_q(
                sess, self.environment.last_state, last_action_reward)

      if segnet_preds is not None:
        mask = self.environment.last_state.get('objectType', None)
        if mask is not None:
          new_classes = np.unique(mask)
          if segnet_preds.shape != mask.shape:
            print("Predictions have shape {}, but groundtruth mask has "
                  "shape {}".format(segnet_preds.shape, mask.shape))
          else:
            similar = segnet_preds == mask
            for id_class in new_classes:
              id_list = self.segnet_class_dict.get(id_class, None)
              if id_list is None:
                id_list = []
              id_list += [[np.sum(similar[mask == id_class]),
                           np.sum(mask == id_class)]]
              self.segnet_class_dict[id_class] = id_list

      self.batch_cur_num += 1
      if flags.segnet == -1:  # just not necessary
        if (self.batch_cur_num != 0 and
            self.batch_cur_num - self.batch_prev_num >= self.batch_size):
          # print(np.unique(self.batch_sobjT))
          feed_dict = {
              self.global_network.base_input: self.batch_si,
              self.global_network.base_segm_mask: self.batch_sobjT,
              self.global_network.is_training: not True
          }
          segm_loss, preds, confusion_mtx = sess.run(
              [self.global_network.decoder_loss, self.global_network.preds,
               self.global_network.update_evaluation_vars],
              feed_dict=feed_dict)
          total_loss = 0
          self.total_loss += [total_loss]
          self.segm_loss += [segm_loss]
          # TODO: here do something with it, store somewhere?

          # update everything else
          self.batch_prev_num = self.batch_cur_num
          self.batch_si = []
          self.batch_sobjT = []
          self.batch_a = []
        else:
          self.batch_si += [self.environment.last_state["image"]]
          self.batch_sobjT += [self.environment.last_state["objectType"]]
          self.batch_a += [self.environment.ACTION_LIST[self.environment.last_action]]

      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      self.episode_reward[-1] += reward

    if terminal:
      ep_info = self.environment._episode_info
      if ep_info['task'] == 'room_goal':
        one_hot_room = ep_info['goal']['roomTypeEncoded']
        room_type = ep_info['goal']['roomType']
        ind = np.where(one_hot_room)[0][0]
        self.roomType_dict[ind] = room_type
        self.episode_roomtype += [ind]
      self.success_rate += [int(self.environment._last_full_state["success"])]
      self.environment.reset()
      self.episode_reward += [0]
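# The segnet_class_dict filled in Evaluate.process accumulates
# [correct_pixels, total_pixels] pairs per ground-truth class. One possible way
# to summarize it into per-class pixel accuracies (a sketch only; the evaluator
# above does not yet do this, see the TODO):
def _per_class_pixel_accuracy(segnet_class_dict):
  accuracies = {}
  for class_id, pairs in segnet_class_dict.items():
    correct = sum(p[0] for p in pairs)
    total = sum(p[1] for p in pairs)
    accuracies[class_id] = correct / float(total) if total > 0 else float('nan')
  return accuracies

# Example: class 3 observed twice, 40/50 and 5/10 pixels correct -> 45/60 = 0.75
assert _per_class_pixel_accuracy({3: [[40, 50], [5, 10]]})[3] == 0.75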