def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name, use_lstm,
             use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc,
             experience_history_size, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_lstm = use_lstm
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.objective_size = Environment.get_objective_size(env_type, env_name)
    self.local_network = UnrealModel(self.action_size, self.objective_size,
                                     thread_index, use_lstm, use_pixel_change,
                                     use_value_replay, use_reward_prediction,
                                     pixel_change_lambda, entropy_beta, device)
    self.local_network.prepare_loss()
    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss, global_network.get_vars(),
        self.local_network.get_vars())
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size()
    self.local_network = UnrealModel(self.action_size, thread_index, device)
    self.local_network.prepare_loss()
    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss, global_network.get_vars(),
        self.local_network.get_vars())
    self.sync = self.local_network.sync_from(global_network)
    self.environment = Environment.create_environment()
    self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
def __init__(self):
    self.env = Environment.create_environment()
    if os.path.exists('human_exp.pkl'):
        # Pickle files must be opened in binary mode.
        with open('human_exp.pkl', 'rb') as f:
            self.ExpPool = pkl.load(f)
    else:
        self.ExpPool = Experience(MAX_EXP)
    pygame.init()
    self.surface = pygame.display.set_mode(DISP_SIZE, 0)
    pygame.display.set_caption('Recorder')
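# The recorder above only loads 'human_exp.pkl'; the write-back step is not
# shown in this excerpt. A minimal sketch of a matching save hook (assuming the
# same `pkl` alias and file name; the method name is illustrative):
def save_pool(self):
    # Persist the recorded experience pool in binary mode, mirroring the
    # binary-mode load in __init__.
    with open('human_exp.pkl', 'wb') as f:
        pkl.dump(self.ExpPool, f)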
def test_process(self):
    experience = Experience(10, 1)
    for i in range(10):
        if i == 5:
            self._add_frame(experience, 1)
        else:
            self._add_frame(experience, 0)
    self.assertTrue(experience.is_full())
    self.assertTrue(experience._top_frame_index == 0)

    self._add_frame(experience, 0)
    self.assertTrue(experience._top_frame_index == 1)

    for i in range(100):
        frames = experience.sample_rp_sequence()
        self.assertTrue(len(frames) == 4)
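# The test relies on an _add_frame helper that is not part of this excerpt.
# A minimal sketch, assuming ExperienceFrame takes
# (state, reward, action, terminal, pixel_change, last_action, last_reward)
# as in the trainers below, and that the test module imports numpy as np;
# the dummy shapes are illustrative only:
def _add_frame(self, experience, reward):
    frame = ExperienceFrame(np.zeros((84, 84, 3)),  # dummy observation
                            reward,
                            0,                      # action
                            False,                  # terminal
                            np.zeros((20, 20)),     # pixel change target
                            0,                      # last action
                            0)                      # last reward
    experience.add_frame(frame)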
class Trainer(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         pixel_change_lambda, entropy_beta,
                                         device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0

    def prepare(self):
        self.environment = Environment.create_environment(self.env_type,
                                                          self.env_name)

    def stop(self):
        self.environment.stop()

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
        Fill experience buffer until buffer is full.
        """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            # Modify Last State - with attention
            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)
            # Modify New State - with attention
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        # [pixel change]
        # Sample 20+1 frame (+1 for last next state)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        # [Value replay]
        # Sample 20+1 frame (+1 for last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state)
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    def _process_rp(self):
        # [Reward prediction]
        rp_experience_frames = self.experience.sample_rp_sequence()
        # 4 frames

        batch_rp_si = []
        batch_rp_c = []

        for i in range(3):
            batch_rp_si.append(rp_experience_frames[i].state)

        # one hot vector for target reward
        r = rp_experience_frames[3].reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        # Fill experience replay buffer
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
            self._process_base(sess, global_t, summary_writer, summary_op,
                               score_input)

        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input: batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            self.local_network.base_initial_lstm_state: start_lstm_state,
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)

            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input: batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)

            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input: batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c = self._process_rp()

            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)

        # Calculate gradients and copy them to global network.
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        self._print_log(global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
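# Sketch of the per-thread driver that calls Trainer.process(); the actual
# training script owns the shared global_t counter, the stop flag and the
# TensorFlow session, so the names below are illustrative only.
def train_function(trainer, sess, summary_writer, summary_op, score_input):
    global global_t, stop_requested
    trainer.set_start_time(time.time())
    while not stop_requested and global_t < trainer.max_global_time_step:
        # process() runs up to local_t_max environment steps (or fills the
        # replay buffer) and reports how far the local counter advanced.
        diff_global_t = trainer.process(sess, global_t, summary_writer,
                                        summary_op, score_input)
        global_t += diff_global_t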
class Trainer(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, env_type, env_name, use_lstm, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, local_t_max, n_step_TD, gamma, gamma_pc, experience_history_size, max_global_time_step, device, segnet_param_dict, image_shape, is_training, n_classes, random_state, termination_time, segnet_lambda, dropout): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.env_type = env_type self.env_name = env_name self.use_lstm = use_lstm self.use_pixel_change = use_pixel_change self.use_value_replay = use_value_replay self.use_reward_prediction = use_reward_prediction self.local_t_max = local_t_max self.n_step_TD = n_step_TD self.gamma = gamma self.gamma_pc = gamma_pc self.experience_history_size = experience_history_size self.max_global_time_step = max_global_time_step self.action_size = Environment.get_action_size(env_type, env_name) self.objective_size = Environment.get_objective_size( env_type, env_name) self.segnet_param_dict = segnet_param_dict self.segnet_mode = self.segnet_param_dict.get("segnet_mode", None) self.is_training = is_training self.n_classes = n_classes self.segnet_lambda = segnet_lambda self.run_metadata = tf.RunMetadata() self.many_runs_timeline = TimeLiner() self.random_state = random_state self.termination_time = termination_time self.dropout = dropout try: self.local_network = UnrealModel( self.action_size, self.objective_size, thread_index, use_lstm, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, device, segnet_param_dict=self.segnet_param_dict, image_shape=image_shape, is_training=is_training, n_classes=n_classes, segnet_lambda=self.segnet_lambda, dropout=dropout) self.local_network.prepare_loss() self.apply_gradients = grad_applier.minimize_local( self.local_network.total_loss, global_network.get_vars(), self.local_network.get_vars(), self.thread_index) self.sync = self.local_network.sync_from(global_network) self.experience = Experience(self.experience_history_size, random_state=self.random_state) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # For log output self.prev_local_t = -1 self.prev_local_t_loss = 0 self.sr_size = 50 self.success_rates = deque(maxlen=self.sr_size) except Exception as e: print(str(e)) #, flush=True) raise Exception( "Problem in Trainer {} initialization".format(thread_index)) def prepare(self, termination_time=50.0, termination_dist_value=-10.0): self.environment = Environment.create_environment( self.env_type, self.env_name, self.termination_time, thread_index=self.thread_index) def stop(self): self.environment.stop() def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return self.random_state.choice(len(pi_values), p=pi_values) def _record_one(self, sess, summary_writer, summary_op, score_input, score, global_t): if self.thread_index >= 0: summary_str = sess.run(summary_op, feed_dict={score_input: score}) for sum_wr in summary_writer: sum_wr.add_summary(summary_str, global_t) def _record_all(self, sess, summary_writer, summary_op, dict_input, dict_eval, global_t): if self.thread_index >= 0: assert set(dict_input.keys()) == set(dict_eval.keys()), print( 
dict_input.keys(), dict_eval.keys()) feed_dict = {} for key in dict_input.keys(): feed_dict.update({dict_input[key]: dict_eval[key]}) summary_str = sess.run(summary_op, feed_dict=feed_dict) for sum_wr in summary_writer: sum_wr.add_summary(summary_str, global_t) def set_start_time(self, start_time): self.start_time = start_time def _fill_experience(self, sess): """ Fill experience buffer until buffer is full. """ #print("Start experience filling", flush=True) prev_state = self.environment.last_state last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward, prev_state) #print("Local network run base policy, value!", flush=True) pi_, _, _ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) new_state, reward, terminal, pixel_change = self.environment.process( action, flag=0) frame = ExperienceFrame( { key: val for key, val in prev_state.items() if 'objectType' not in key }, reward, action, terminal, pixel_change, last_action, last_reward) self.experience.add_frame(frame) if terminal: self.environment.reset() if self.experience.is_full(): self.environment.reset() print("Replay buffer filled") def _print_log(self, global_t): if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) #, flush=True) # print("### Experience : {}".format(self.experience.get_debug_string())) def _process_base(self, sess, global_t, summary_writer, summary_op_dict, summary_dict): #, losses_input): # [Base A3C] states = [] last_action_rewards = [] actions = [] rewards = [] values = [] terminal_end = False start_lstm_state = None if self.use_lstm: start_lstm_state = self.local_network.base_lstm_state_out mode = "segnet" if self.segnet_mode >= 2 else "" # t_max times loop flag = 0 for _ in range(self.n_step_TD): # Prepare last action reward last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward, self.environment.last_state) pi_, value_, losses = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward, mode) action = self.choose_action(pi_) states.append(self.environment.last_state) last_action_rewards.append(last_action_reward) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("Trainer {}>>> Local step {}:".format( self.thread_index, self.local_t)) print("Trainer {}>>> pi={}".format(self.thread_index, pi_)) print("Trainer {}>>> V={}".format(self.thread_index, value_)) flag = 1 prev_state = self.environment.last_state # Process game new_state, reward, terminal, pixel_change = self.environment.process( action, flag=flag) frame = ExperienceFrame( { key: val for key, val in prev_state.items() if 'objectType' not in key }, reward, action, terminal, pixel_change, last_action, last_reward) # Store to experience self.experience.add_frame(frame) # Use to know about Experience collection #print(self.experience.get_debug_string()) 
self.episode_reward += reward rewards.append(reward) self.local_t += 1 if terminal: terminal_end = True print("Trainer {}>>> score={}".format( self.thread_index, self.episode_reward)) #, flush=True) summary_dict['values'].update( {'score_input': self.episode_reward}) success = 1 if self.environment._last_full_state[ "success"] else 0 #print("Type:", type(self.environment._last_full_state["success"]), len(self.success_rates), success) self.success_rates.append(success) summary_dict['values'].update({ 'sr_input': np.mean(self.success_rates) if len(self.success_rates) == self.sr_size else 0 }) self.episode_reward = 0 self.environment.reset() self.local_network.reset_state() if flag: flag = 0 break R = 0.0 if not terminal_end: R = self.local_network.run_base_value( sess, new_state, frame.get_action_reward(self.action_size)) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_adv = [] batch_R = [] batch_sobjT = [] for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + self.gamma * R adv = R - Vi a = np.zeros([self.action_size]) a[ai] = 1.0 batch_si.append(si['image']) batch_a.append(a) batch_adv.append(adv) batch_R.append(R) if self.segnet_param_dict["segnet_mode"] >= 2: batch_sobjT.append(si['objectType']) batch_si.reverse() batch_a.reverse() batch_adv.reverse() batch_R.reverse() batch_sobjT.reverse() #print(np.unique(batch_sobjT)) ## HERE Mathematical Error A3C: only last values should be used for base/ or aggregate with last made return batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state def _process_pc(self, sess): # [pixel change] # Sample 20+1 frame (+1 for last next state) #print(">>> Process run!", flush=True) pc_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last # pc_experience_frames.reverse() pc_experience_frames = pc_experience_frames[::-1] #print(">>> Process ran!", flush=True) batch_pc_si = [] batch_pc_a = [] batch_pc_R = [] batch_pc_last_action_reward = [] pc_R = np.zeros([20, 20], dtype=np.float32) if not pc_experience_frames[1].terminal: pc_R = self.local_network.run_pc_q_max( sess, pc_experience_frames[0].state, pc_experience_frames[0].get_last_action_reward( self.action_size)) #print(">>> Process run!", flush=True) for frame in pc_experience_frames[1:]: pc_R = frame.pixel_change + self.gamma_pc * pc_R a = np.zeros([self.action_size]) a[frame.action] = 1.0 last_action_reward = frame.get_last_action_reward(self.action_size) batch_pc_si.append(frame.state['image']) batch_pc_a.append(a) batch_pc_R.append(pc_R) batch_pc_last_action_reward.append(last_action_reward) batch_pc_si.reverse() batch_pc_a.reverse() batch_pc_R.reverse() batch_pc_last_action_reward.reverse() #print(">>> Process ended!", flush=True) return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R def _process_vr(self, sess): # [Value replay] # Sample 20+1 frame (+1 for last next state) vr_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last vr_experience_frames.reverse() batch_vr_si = [] batch_vr_R = [] batch_vr_last_action_reward = [] vr_R = 0.0 if not vr_experience_frames[1].terminal: vr_R = self.local_network.run_vr_value( sess, vr_experience_frames[0].state, vr_experience_frames[0].get_last_action_reward( self.action_size)) # t_max times loop for frame in vr_experience_frames[1:]: vr_R = frame.reward + self.gamma * vr_R 
batch_vr_si.append(frame.state['image']) batch_vr_R.append(vr_R) last_action_reward = frame.get_last_action_reward(self.action_size) batch_vr_last_action_reward.append(last_action_reward) batch_vr_si.reverse() batch_vr_R.reverse() batch_vr_last_action_reward.reverse() return batch_vr_si, batch_vr_last_action_reward, batch_vr_R def _process_rp(self): # [Reward prediction] rp_experience_frames = self.experience.sample_rp_sequence() # 4 frames batch_rp_si = [] batch_rp_c = [] for i in range(3): batch_rp_si.append(rp_experience_frames[i].state['image']) # one hot vector for target reward r = rp_experience_frames[3].reward rp_c = [0.0, 0.0, 0.0] if -1e-10 < r < 1e-10: rp_c[0] = 1.0 # zero elif r > 0: rp_c[1] = 1.0 # positive else: rp_c[2] = 1.0 # negative batch_rp_c.append(rp_c) return batch_rp_si, batch_rp_c def process(self, sess, global_t, summary_writer, summary_op_dict, score_input, sr_input, eval_input, entropy_input, term_global_t, losses_input): if self.prev_local_t == -1 and self.segnet_mode >= 2: self.prev_local_t = 0 sess.run(self.local_network.reset_evaluation_vars) # Fill experience replay buffer #print("Inside train process of thread!", flush=True) if not self.experience.is_full(): self._fill_experience(sess) return 0, None start_local_t = self.local_t episode_score = None cur_learning_rate = self._anneal_learning_rate(global_t) #print("Weights copying!", flush=True) # Copy weights from shared to local sess.run(self.sync) #print("Weights copied successfully!", flush=True) summary_dict = {'placeholders': {}, 'values': {}} summary_dict['placeholders'].update(losses_input) # [Base] #print("[Base]", flush=True) batch_si, batch_sobjT, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state, = \ self._process_base(sess, global_t, summary_writer, summary_op_dict, summary_dict) if summary_dict['values'].get('score_input', None) is not None: self._record_one(sess, summary_writer, summary_op_dict['score_input'], score_input, summary_dict['values']['score_input'], global_t) self._record_one(sess, summary_writer, summary_op_dict['sr_input'], sr_input, summary_dict['values']['sr_input'], global_t) #self._record_one(sess, summary_writer, summary_op_dict['term_global_t'], term_global_t, # global_t, global_t) #summary_writer[0].flush() # summary_writer[1].flush() # Return advanced local step size episode_score = summary_dict['values'].get('score_input', None) summary_dict['values'] = {} feed_dict = { self.local_network.base_input: batch_si, self.local_network.base_last_action_reward_input: batch_last_action_rewards, self.local_network.base_a: batch_a, self.local_network.base_adv: batch_adv, self.local_network.base_r: batch_R, # [common] self.learning_rate_input: cur_learning_rate, self.is_training: True } if self.use_lstm: feed_dict[ self.local_network.base_initial_lstm_state] = start_lstm_state if self.segnet_param_dict["segnet_mode"] >= 2: feed_dict[self.local_network.base_segm_mask] = batch_sobjT #print("[Pixel change]", flush=True) # [Pixel change] if self.use_pixel_change: batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc( sess) pc_feed_dict = { self.local_network.pc_input: batch_pc_si, self.local_network.pc_last_action_reward_input: batch_pc_last_action_reward, self.local_network.pc_a: batch_pc_a, self.local_network.pc_r: batch_pc_R } feed_dict.update(pc_feed_dict) #print("[Value replay]", flush=True) # [Value replay] if self.use_value_replay: batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr( sess) vr_feed_dict = { 
self.local_network.vr_input: batch_vr_si, self.local_network.vr_last_action_reward_input: batch_vr_last_action_reward, self.local_network.vr_r: batch_vr_R } feed_dict.update(vr_feed_dict) # [Reward prediction] #print("[Reward prediction]", flush=True) if self.use_reward_prediction: batch_rp_si, batch_rp_c = self._process_rp() rp_feed_dict = { self.local_network.rp_input: batch_rp_si, self.local_network.rp_c_target: batch_rp_c } feed_dict.update(rp_feed_dict) #print(len(batch_rp_c), batch_rp_c) grad_check = None #if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: # grad_check = [tf.add_check_numerics_ops()] #print("Applying gradients in train!", flush=True) # Calculate gradients and copy them to global network. out_list = [self.apply_gradients] out_list += [ self.local_network.total_loss, self.local_network.base_loss, self.local_network.policy_loss, self.local_network.value_loss, self.local_network.entropy ] if self.segnet_mode >= 2: out_list += [self.local_network.decoder_loss] out_list += [self.local_network.regul_loss] if self.use_pixel_change: out_list += [self.local_network.pc_loss] if self.use_value_replay: out_list += [self.local_network.vr_loss] if self.use_reward_prediction: out_list += [self.local_network.rp_loss] if self.segnet_mode >= 2: out_list += [self.local_network.update_evaluation_vars] if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: out_list += [self.local_network.evaluation] import time now = time.time() with tf.control_dependencies(grad_check): if GPU_LOG: return_list = sess.run(out_list, feed_dict=feed_dict, options=run_options, run_metadata=self.run_metadata) else: return_list = sess.run(out_list, feed_dict=feed_dict, options=run_options) if time.time() - now > 30.0: print( "Too much time on sess.run: check tensorflow") #, flush=True) sys.exit(0) raise ValueError("More than 100 seconds update in tensorflow!") # gradients_tuple, total_loss, base_loss, policy_loss, value_loss, entropy = return_list[: 6] grad_norm = gradients_tuple[1] return_list = return_list[6:] return_string = "Trainer {}>>> Total loss: {}, Base loss: {}\n".format( self.thread_index, total_loss, base_loss) return_string += "\t\tPolicy loss: {}, Value loss: {}, Grad norm: {}\nEntropy: {}\n".format( policy_loss, value_loss, grad_norm, entropy) losses_eval = { 'all/total_loss': total_loss, 'all/base_loss': base_loss, 'all/policy_loss': policy_loss, 'all/value_loss': value_loss, 'all/loss/grad_norm': grad_norm } if self.segnet_mode >= 2: decoder_loss, l2_loss = return_list[:2] return_list = return_list[2:] return_string += "\t\tDecoder loss: {}, L2 weights loss: {}\n".format( decoder_loss, l2_loss) losses_eval.update({ 'all/decoder_loss': decoder_loss, 'all/l2_weights_loss': l2_loss }) if self.use_pixel_change: pc_loss = return_list[0] return_list = return_list[1:] return_string += "\t\tPC loss: {}\n".format(pc_loss) losses_eval.update({'all/pc_loss': pc_loss}) if self.use_value_replay: vr_loss = return_list[0] return_list = return_list[1:] return_string += "\t\tVR loss: {}\n".format(vr_loss) losses_eval.update({'all/vr_loss': vr_loss}) if self.use_reward_prediction: rp_loss = return_list[0] return_list = return_list[1:] return_string += "\t\tRP loss: {}\n".format(rp_loss) losses_eval.update({'all/rp_loss': rp_loss}) if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: if self.segnet_mode >= 2: return_string += "\t\tmIoU: {}\n".format(return_list[-1]) summary_dict['values'].update(losses_eval) # Printing losses if self.local_t - 
self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL: if self.segnet_mode >= 2: self._record_one(sess, summary_writer, summary_op_dict['eval_input'], eval_input, return_list[-1], global_t) self._record_one(sess, summary_writer, summary_op_dict['entropy'], entropy_input, entropy, global_t) # summary_writer[0].flush() # summary_writer[1].flush() print(return_string) self.prev_local_t_loss += LOSS_AND_EVAL_LOG_INTERVAL if GPU_LOG: fetched_timeline = timeline.Timeline(self.run_metadata.step_stats) chrome_trace = fetched_timeline.generate_chrome_trace_format() self.many_runs_timeline.update_timeline(chrome_trace) self._print_log(global_t) #Recording score and losses self._record_all(sess, summary_writer, summary_op_dict['losses_input'], summary_dict['placeholders'], summary_dict['values'], global_t) # Return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, episode_score
class Trainer(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size()
        self.local_network = MapReaderModel(self.action_size, thread_index,
                                            device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

        # Curriculum: lower thread indices train on larger mazes.
        self.maze_size = 5
        if self.thread_index in range(2):
            self.maze_size = 13
        elif self.thread_index in [2, 3]:
            self.maze_size = 11
        elif self.thread_index in [4, 5]:
            self.maze_size = 9
        elif self.thread_index in [6, 7]:
            self.maze_size = 7
        self.level_seed = np.random.randint(LEVEL_SET_SIZE)

        # For log output
        self.prev_local_t = 0
        self.last_terminal_local_t = 0
        self.steps_buffer = deque()
        self.correct_exits = 0
        self.running = True

    def prepare(self):
        if self.running:
            self.environment = Environment.create_environment(
                self.maze_size, self.level_seed)
            print('Started trainer ', self.thread_index)
            self.apply_next_location_loss = 0.0
            sys.stdout.flush()

    def stop(self):
        self.environment.stop()
        self.last_terminal_local_t = self.local_t

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    @staticmethod
    def choose_action(pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
        Fill experience buffer until buffer is full.
        """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_intrinsic_reward = self.environment.last_intrinsic_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        input_map = self.environment.map

        prev_localization_state, pi_, _, short_term_goal, shift_weights, location_distribution = \
            self.local_network.run_policy_and_value(
                sess, prev_state, last_action_reward, input_map, replan=False)
        action = self.choose_action(pi_)

        new_state, reward, intrinsic_reward, terminal = self.environment.process(
            action, short_term_goal, shift_weights)

        frame = ExperienceFrame(prev_state, input_map, prev_localization_state,
                                location_distribution, reward,
                                intrinsic_reward, action, terminal,
                                last_action, last_reward,
                                last_intrinsic_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.level_seed = np.random.randint(LEVEL_SET_SIZE)
            self.environment.reset(self.maze_size, self.level_seed)
        if self.experience.is_full():
            print(
                "Replay buffer filled--------------------------------------------------------------------------------------"
            )
            sys.stdout.flush()

    def _process_base(self, sess, global_t, map_input):
        # [Base A3C]
        states = []
        actions = []
        batch_last_action_rewards = []
        rewards = []
        values = []

        terminal_end = False
        replan = (self.apply_next_location_loss == 0.0)

        start_localization_state = self.local_network.localization_state_out

        # t_max times loop
        for _ in range(LOCAL_T_MAX):
            self.local_t += 1

            # Previous state
            prev_state = self.environment.last_state
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_intrinsic_reward = self.environment.last_intrinsic_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            prev_localization_state, pi_, value_, short_term_goal, shift_weights, location_distribution = \
                self.local_network.run_policy_and_value(
                    sess, prev_state, last_action_reward, map_input, replan)
            replan = False
            action = self.choose_action(pi_)

            states.append(prev_state)
            actions.append(
                ExperienceFrame.get_action_neurons(action, self.action_size))
            batch_last_action_rewards.append(last_action_reward)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Process game
            new_state, reward, intrinsic_reward, terminal = self.environment.process(
                action, short_term_goal, shift_weights)
            frame = ExperienceFrame(prev_state, map_input,
                                    prev_localization_state,
                                    location_distribution, reward,
                                    intrinsic_reward, action, terminal,
                                    last_action, last_reward,
                                    last_intrinsic_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward + intrinsic_reward
            rewards.append(reward + intrinsic_reward)

            if terminal:
                terminal_end = True
                if reward > 0:
                    self.correct_exits += 1
                steps_needed = self.local_t - self.last_terminal_local_t
                self.last_terminal_local_t = self.local_t
                self.steps_buffer.append(steps_needed)
                if len(self.steps_buffer) > 50:
                    self.steps_buffer.popleft()
                print("Steps needed: ", steps_needed)
                print("score={}".format(self.episode_reward))
                self.episode_reward = 0

                if (np.mean(self.steps_buffer) < 100 + (self.maze_size - 7) * 20
                        and len(self.steps_buffer) == 50):
                    self.maze_size += 2
                    if self.maze_size > 13:
                        print(">>>>>>>>>>> REACHED END <<<<<<<<<<<")
                        self.environment.stop()
                        sys.stdout.flush()
                        self.running = False
                        break
                    print(">>>>>> SWITCHING TO MAZES OF SIZE ", self.maze_size,
                          "x", self.maze_size, " AT GLOBAL T ", global_t,
                          " <<<<<<<<<<<<<<<")
                    sys.stdout.flush()
                    # reset moving average
                    self.correct_exits = 0
                    self.steps_buffer = deque()

                self.level_seed = np.random.randint(LEVEL_SET_SIZE)
                self.environment.reset(self.maze_size, self.level_seed)
                self.local_network.reset_state()
                break

            last_action_reward = ExperienceFrame.concat_action_and_reward(
                action, self.action_size, reward)

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, new_state,
                                             last_action_reward, frame.map)
            self.apply_next_location_loss = 1.0
        else:
            self.apply_next_location_loss = 0.0

        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_adv = []
        batch_R = []

        for (ri, si, Vi) in zip(rewards, states, values):
            R = ri + GAMMA * R
            adv = R - Vi
            batch_si.append(si)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, batch_last_action_rewards, actions, batch_adv, batch_R, start_localization_state

    def process(self, sess, global_t, summary_writer):
        # Fill experience replay buffer
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t
        cur_learning_rate = self._anneal_learning_rate(global_t)
        apply_location_loss = self.apply_next_location_loss

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        map_input = self.environment.map
        batch_si, batch_last_action_rewards, batch_actions, batch_adv, batch_R, start_localization_state = \
            self._process_base(sess, global_t, map_input)

        vlm_frames = np.random.choice(self.experience.get_frames(),
                                      LOCAL_T_MAX)

        feed_dict = {
            self.local_network.input: map(lambda st: st['view'], batch_si),
            self.local_network.map: [map_input],
            self.local_network.replan: True,  # force replanning with updated reward model
            self.local_network.old_plan: np.zeros([1, 63, 63, 4]),
            self.local_network.last_action_reward: batch_last_action_rewards,
            self.local_network.angle_neurons: map(lambda st: st['angle'][0], batch_si),
            self.local_network.initial_localization_state: start_localization_state,
            # loss inputs
            self.local_network.a: batch_actions,
            self.local_network.adv: batch_adv,
            self.local_network.r: batch_R,
            self.local_network.location_loss_gate: apply_location_loss,
            self.local_network.position_indices: map(lambda st: st['position'][2], batch_si),
            self.local_network.location_probability_target: map(lambda st: st['position'][0], batch_si),
            # visual local map network
            self.local_network.visual_local_map_target: map(lambda f: f.state['vlm'], vlm_frames),
            self.local_network.vlm_view_input: map(lambda f: f.state['view'], vlm_frames),
            self.local_network.vlm_angle: map(lambda f: f.state['angle'][0], vlm_frames),
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # Map reward prediction
        map_rp_frames = []
        map_rp_classes = []
        for _ in range(LOCAL_T_MAX):
            rp_frame = self.experience.sample_rp_frame()
            rp_c = [0.0, 0.0, 0.0]
            if rp_frame.reward == 0:
                rp_c[0] = 1.0  # zero
            elif rp_frame.reward > 0:
                rp_c[1] = 1.0  # positive
            else:
                rp_c[2] = 1.0  # negative
            map_rp_frames.append(rp_frame)
            map_rp_classes.append(rp_c)

        feed_dict.update({
            self.local_network.map_rp_location_distribution:
                map(lambda f: f.location_distribution, map_rp_frames),
            self.local_network.map_rp_maps:
                map(lambda f: f.map, map_rp_frames),
            # loss input
            self.local_network.map_rp_c_target: map_rp_classes,
            self.local_network.map_rp_loss_gate: 1.0
        })

        # Calculate gradients and copy them to global network.
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        self._print_log(global_t, sess, feed_dict, summary_writer)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def _print_log(self, global_t, sess, feed_dict, summary_writer):
        if (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            if (self.thread_index == 0):
                elapsed_time = time.time() - self.start_time
                steps_per_sec = global_t / elapsed_time
                print(
                    "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                    .format(global_t, elapsed_time, steps_per_sec,
                            steps_per_sec * 3600 / 1000000.))
            if self.steps_buffer:
                print('--- Thread ', self.thread_index, ' at global_t ',
                      global_t, ' reached ', self.correct_exits, ' exits in ',
                      np.mean(self.steps_buffer),
                      ' steps on average in mazes of size ', self.maze_size,
                      'x', self.maze_size)
            feed_dict.update({
                self.local_network.episode: self.maze_size,
                self.local_network.steps: self.steps_buffer
            })
            summary_str = sess.run(self.local_network.summary_op,
                                   feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, global_t)
            summary_writer.flush()
            sys.stdout.flush()
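# Note on the feed_dict construction above: it builds the fed batches with
# map(). Under Python 2 that yields lists, which Session.run accepts; under
# Python 3 map() returns a lazy iterator and the values would need to be
# materialized first. A hypothetical helper that keeps the call sites
# unchanged:
def as_batch(fn, items):
    return list(map(fn, items))
# e.g. self.local_network.input: as_batch(lambda st: st['view'], batch_si)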
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, vf_coeff, local_t_max, gamma,
             gamma_pc, experience_history_size, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.local_network = UnrealModel(self.action_size, thread_index,
                                     use_pixel_change, use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda, entropy_beta,
                                     vf_coeff, device)
    # self.local_network.prepare_loss()

    # adding things for acktr
    self.local_network.prepare_loss_acktr()
    self.optim = optim = KfacOptimizer(
        learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
        epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10,
        max_grad_norm=max_grad_norm)
    update_stats_op = optim.compute_and_apply_stats(
        self.local_network.fisher_loss, var_list=self.local_network.params)
    train_op, q_runner = optim.apply_gradients(
        list(zip(self.local_network.grads, self.local_network.params)))

    # update the rest according to normal stuff
    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.intermediate_loss, global_network.get_vars(),
        self.local_network.get_vars())

    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
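# The KFAC pieces created above (update_stats_op, train_op, q_runner) are only
# bound to locals. A sketch of how the queue runner is typically started,
# assuming a baselines-style KfacOptimizer whose apply_gradients returns a
# (train_op, queue_runner) pair; the helper name is illustrative:
def start_kfac_queue(sess, q_runner):
    coord = tf.train.Coordinator()
    threads = []
    if q_runner is not None:
        # Launch the background threads that service KFAC's asynchronous
        # statistics updates before any training steps are run.
        threads = q_runner.create_threads(sess, coord=coord, start=True)
    return coord, threads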
def run(self):
    device = "/cpu:0"
    if USE_GPU:
        device = "/gpu:0"
    logger.debug("start App")

    initial_learning_rate = flags.initial_learning_rate

    self.global_t = 0
    self.aux_t = 0

    self.stop_requested = False
    self.terminate_requested = False

    logger.debug("getting action size and observation size...")
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    obs_size = Environment.get_obs_size(flags.env_type, flags.env_name)

    # Setup Global Network
    logger.debug("loading global model...")
    self.global_network = UnrealModel(
        action_size,
        obs_size,
        -1,
        flags.entropy_beta,
        device,
        use_pixel_change=flags.use_pixel_change,
        use_value_replay=flags.use_value_replay,
        use_reward_prediction=flags.use_reward_prediction,
        use_temporal_coherence=flags.use_temporal_coherence,
        use_proportionality=flags.use_proportionality,
        use_causality=flags.use_causality,
        use_repeatability=flags.use_repeatability,
        value_lambda=flags.value_lambda,
        pixel_change_lambda=flags.pixel_change_lambda,
        temporal_coherence_lambda=flags.temporal_coherence_lambda,
        proportionality_lambda=flags.proportionality_lambda,
        causality_lambda=flags.causality_lambda,
        repeatability_lambda=flags.repeatability_lambda)
    logger.debug("done loading global model")

    learning_rate_input = tf.placeholder("float")

    # Setup gradient calculator
    grad_applier = RMSPropApplier(
        learning_rate=learning_rate_input,
        # decay=flags.rmsp_alpha,
        momentum=0.0,
        # epsilon=flags.rmsp_epsilon,
        clip_norm=flags.grad_norm_clip,
        device=device)
    # Alternative applier, kept here disabled:
    # grad_applier = AdamApplier(learning_rate=learning_rate_input,
    #                            clip_norm=flags.grad_norm_clip,
    #                            device=device)

    # Start environment
    self.environment = Environment.create_environment(flags.env_type,
                                                      flags.env_name)
    logger.debug("done loading environment")

    # Setup runner
    self.runner = RunnerThread(flags, self.environment, self.global_network,
                               action_size, obs_size, device, visualise)
    logger.debug("done setting up RunnerThread")

    # Setup experience
    self.experience = Experience(flags.experience_history_size)

    # @TODO check device usage: should we build a cluster?

    # Setup Base Network
    self.base_trainer = BaseTrainer(
        self.runner, self.global_network, initial_learning_rate,
        learning_rate_input, grad_applier, flags.env_type, flags.env_name,
        flags.entropy_beta, flags.gamma, self.experience, flags.max_time_step,
        device, flags.value_lambda)

    # Setup Aux Networks
    self.aux_trainers = []
    for k in range(flags.parallel_size):
        self.aux_trainers.append(
            AuxTrainer(
                self.global_network,
                k + 2,  # -1 is global, 0 is runnerthread, 1 is base
                flags.use_base,
                flags.use_pixel_change,
                flags.use_value_replay,
                flags.use_reward_prediction,
                flags.use_temporal_coherence,
                flags.use_proportionality,
                flags.use_causality,
                flags.use_repeatability,
                flags.value_lambda,
                flags.pixel_change_lambda,
                flags.temporal_coherence_lambda,
                flags.proportionality_lambda,
                flags.causality_lambda,
                flags.repeatability_lambda,
                flags.aux_initial_learning_rate,
                learning_rate_input,
                grad_applier,
                self.aux_t,
                flags.env_type,
                flags.env_name,
                flags.entropy_beta,
                flags.local_t_max,
                flags.gamma,
                flags.aux_lambda,
                flags.gamma_pc,
                self.experience,
                flags.max_time_step,
                device))

    # Start tensorflow session
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    self.sess.run(tf.global_variables_initializer())

    self.init_tensorboard()

    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_vars())

    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        checkpointpath = checkpoint.model_checkpoint_path.replace("/", "\\")
        logger.info("checkpoint loaded: {}".format(checkpointpath))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        self.global_t = int(tokens[1])
        logger.info(">>> global step set: {}".format(self.global_t))
        logger.info(">>> aux step: {}".format(self.aux_t))
        # set wall time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
            self.global_t)
        with open(wall_t_fname, 'r') as f:
            self.wall_t = float(f.read())
        self.next_save_steps = (
            self.global_t + flags.save_interval_step
        ) // flags.save_interval_step * flags.save_interval_step
        logger.debug("next save steps:{}".format(self.next_save_steps))
    else:
        logger.info("Could not find old checkpoint")
        # set wall time
        self.wall_t = 0.0
        self.next_save_steps = flags.save_interval_step

    signal.signal(signal.SIGINT, self.signal_handler)

    # set start time
    self.start_time = time.time() - self.wall_t

    # Start runner
    self.runner.start_runner(self.sess)

    # Start base_network thread
    self.base_train_thread = threading.Thread(
        target=self.base_train_function, args=())
    self.base_train_thread.start()

    # Start aux_network threads
    self.aux_train_threads = []
    for k in range(flags.parallel_size):
        self.aux_train_threads.append(
            threading.Thread(target=self.aux_train_function, args=(k,)))
        self.aux_train_threads[k].start()

    logger.debug(threading.enumerate())

    logger.info('Press Ctrl+C to stop')
    signal.pause()
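# The thread targets base_train_function / aux_train_function are referenced
# above but not included in this excerpt. A rough, hypothetical sketch of their
# shape (the actual process() signatures of BaseTrainer/AuxTrainer may differ):
def base_train_function(self):
    while not self.stop_requested and self.global_t < flags.max_time_step:
        diff_global_t = self.base_trainer.process(self.sess, self.global_t)
        self.global_t += diff_global_t

def aux_train_function(self, k):
    while not self.stop_requested and self.global_t < flags.max_time_step:
        diff_aux_t = self.aux_trainers[k].process(self.sess, self.aux_t)
        self.aux_t += diff_aux_t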
class Trainer(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 use_future_reward_prediction, use_autoencoder, reward_length,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device, log_file, skip_step):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.use_future_reward_prediction = use_future_reward_prediction
        self.use_autoencoder = use_autoencoder
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.skip_step = skip_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         use_future_reward_prediction,
                                         use_autoencoder, pixel_change_lambda,
                                         entropy_beta, device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size,
                                     reward_length)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0
        self.log_file = log_file
        self.prediction_res_file = log_file + '/' + 'res.pkl'

    def prepare(self):
        self.environment = Environment.create_environment(
            self.env_type, self.env_name, self.skip_step)

    def stop(self):
        self.environment.stop()

    def add_summary(self, step, name, value, writer):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = float(value)
        summary_value.tag = name
        writer.add_summary(summary, step)

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """Fill the experience replay buffer until it is full."""
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)
        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        # [Pixel change]
        # Sample local_t_max+1 frames (+1 for the last next state)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        # [Value replay]
        # Sample local_t_max+1 frames (+1 for the last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state)
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    '''
    def _process_rp(self):
        # [Reward prediction]
        # 4 frames
        rp_experience_frames, total_raw_reward, _ = self.experience.sample_rp_sequence()

        batch_rp_si = []
        batch_rp_c = []

        for i in range(4):
            batch_rp_si.append(rp_experience_frames[i].state)

        # one hot vector for target reward
        r = total_raw_reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c
    '''

    def _process_replay(self, action=False):
        # [Reward prediction]
        # 4 frames
        rp_experience_frames, total_raw_reward, next_frame = \
            self.experience.sample_rp_sequence(flag=True)

        batch_rp_si = []
        batch_rp_c = []

        for i in range(4):
            batch_rp_si.append(rp_experience_frames[i].state)

        # one hot vector for target reward
        r = total_raw_reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)

        result = [batch_rp_si, batch_rp_c, next_frame]
        if action:
            batch_rp_action = []
            action_index = rp_experience_frames[3].action
            action_one_hot = np.zeros([self.action_size])
            action_one_hot[action_index] = 1.0
            batch_rp_action.append(action_one_hot)
            result.append(batch_rp_action)
        return result

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        # Fill experience replay buffer
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
            self._process_base(sess, global_t, summary_writer, summary_op,
                               score_input)

        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input: batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            self.local_network.base_initial_lstm_state: start_lstm_state,
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = \
                self._process_pc(sess)

            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input: batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = \
                self._process_vr(sess)

            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input: batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        next_frame = None
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c, next_frame = self._process_replay()

            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)

        # [Future reward prediction]
        if self.use_future_reward_prediction:
            batch_frp_si, batch_frp_c, next_frame, batch_frp_action = \
                self._process_replay(action=True)

            frp_feed_dict = {
                self.local_network.frp_input: batch_frp_si,
                self.local_network.frp_c_target: batch_frp_c,
                self.local_network.frp_action_input: batch_frp_action
            }
            feed_dict.update(frp_feed_dict)

        # [Autoencoder target]
        if next_frame and self.use_autoencoder:
            ae_feed_dict = {
                self.local_network.ground_truth:
                    np.expand_dims(next_frame.state, axis=0)
            }
            feed_dict.update(ae_feed_dict)

        # Calculate gradients and copy them to the global network.
        # sess.run(self.apply_gradients, feed_dict=feed_dict)
        ln = self.local_network
        if self.use_future_reward_prediction:
            if self.use_autoencoder:
                frp_c, decoder_loss, frp_loss, value_loss, policy_loss, _ = sess.run(
                    [
                        ln.frp_c, ln.decoder_loss, ln.frp_loss, ln.value_loss,
                        ln.policy_loss, self.apply_gradients
                    ],
                    feed_dict=feed_dict)
                self.add_summary(global_t, 'decoder_loss', decoder_loss,
                                 summary_writer)
                self.add_summary(global_t, 'frp_loss', frp_loss,
                                 summary_writer)
            else:
                frp_c, value_loss, policy_loss, _ = sess.run(
                    [
                        ln.frp_c, ln.value_loss, ln.policy_loss,
                        self.apply_gradients
                    ],
                    feed_dict=feed_dict)
            acc = ((frp_c == frp_c.max()) * batch_frp_c).sum()
            self.add_summary(global_t, 'reward prediction accuracy', acc,
                             summary_writer)
        else:
            value_loss, policy_loss, _ = sess.run(
                [ln.value_loss, ln.policy_loss, self.apply_gradients],
                feed_dict=feed_dict)

        self.add_summary(global_t, 'value_loss', value_loss, summary_writer)
        self.add_summary(global_t, 'policy_loss', policy_loss, summary_writer)
        self.add_summary(global_t, 'base_loss', policy_loss + value_loss,
                         summary_writer)

        if self.use_autoencoder and global_t % 25000 == 0:
            current_res = {
                'next_frame_ground_truth': next_frame,
                'step': global_t
            }
            if self.use_reward_prediction:
                predicted_frame, predicted_reward = sess.run(
                    [
                        self.local_network.encoder_output,
                        self.local_network.rp_c
                    ],
                    feed_dict=feed_dict)
                current_res['states'] = batch_rp_si
                current_res['target_reward'] = batch_rp_c
            elif self.use_future_reward_prediction:
                predicted_frame, predicted_reward = sess.run(
                    [
                        self.local_network.encoder_output,
                        self.local_network.frp_c
                    ],
                    feed_dict=feed_dict)
                current_res['states'] = batch_frp_si
                current_res['target_reward'] = batch_frp_c
                current_res['action'] = batch_frp_action
            current_res['next_frame_prediction'] = predicted_frame
            current_res['next_reward_prediction'] = predicted_reward

            if os.path.exists(self.prediction_res_file) and os.path.getsize(
                    self.prediction_res_file) > 0:
                with open(self.prediction_res_file, 'rb') as f:
                    res = pickle.load(f)
            else:
                res = []
            res.append(current_res)
            with open(self.prediction_res_file, 'wb') as f:
                pickle.dump(res, f)

        self._print_log(global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
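
# Usage sketch (an assumption, not code from the original project): one way a
# per-thread driver could run Trainer.process() in an A3C-style loop. It
# presumes a main script that already defines `sess`, `summary_writer`,
# `summary_op`, `score_input`, a list `trainers`, a shared counter `global_t`,
# and a `stop_requested` flag; the name `train_function` is illustrative.
def train_function(trainer_index):
    """Drive one Trainer until the global step budget is exhausted."""
    trainer = trainers[trainer_index]
    trainer.prepare()  # create this thread's environment
    trainer.set_start_time(time.time())
    global global_t
    while global_t < trainer.max_global_time_step and not stop_requested:
        # process() returns 0 while the replay buffer is still filling,
        # otherwise the number of local steps it advanced.
        diff_global_t = trainer.process(sess, global_t, summary_writer,
                                        summary_op, score_input)
        global_t += diff_global_t
    trainer.stop()
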
class Recorder(object):
    """
    Record the actions played by a human to accelerate reinforcement learning.
    """

    def __init__(self):
        self.env = Environment.create_environment()
        if os.path.exists('human_exp.pkl'):
            # Pickle files must be opened in binary mode.
            with open('human_exp.pkl', 'rb') as f:
                self.ExpPool = pkl.load(f)
        else:
            self.ExpPool = Experience(MAX_EXP)
        pygame.init()
        self.surface = pygame.display.set_mode(DISP_SIZE, 0)
        pygame.display.set_caption('Recorder')

    def update(self):
        self.surface.fill(BLACK)
        obs, reward, terminal, pc, action, depth, v_linear, v_angular = \
            self.process()
        if action != 3:
            print('linear velocity: ', end='')
            pprint(v_linear)
            print('angular velocity: ', end='')
            pprint(v_angular)
        # self.record(obs, reward, terminal, pc, action)
        pygame.display.update()

    def choose_action(self):
        # Action 3 is used when no movement key is pressed.
        action = 3
        pressed = pygame.key.get_pressed()
        if pressed[pygame.K_a]:
            action = 0
        elif pressed[pygame.K_d]:
            action = 1
        elif pressed[pygame.K_w]:
            action = 2
        return action

    def process(self):
        action = self.choose_action()
        obs, reward, terminal, pc, v_linear, v_angular = self.env.process(
            action)
        # data = misc.imresize(obs*255.0, DISP_SIZE)
        data = obs[:, :, :3] * 255.0
        image = pygame.image.frombuffer(data.astype(np.uint8), DISP_SIZE,
                                        "RGB")
        depth = obs[:, :, 3] * 255.0
        self.surface.blit(image, (0, 0))
        if terminal:
            self.env.reset()
        return obs, reward, terminal, pc, action, depth, v_linear, v_angular

    def record(self, obs, reward, terminal, pc, action):
        last_state = self.env.last_state
        last_action = self.env.last_action
        last_reward = self.env.last_reward
        frame = ExperienceFrame(last_state, reward, action, terminal, pc,
                                last_action, last_reward)
        self.ExpPool.add_frame(frame)
        if self.ExpPool.is_full():
            print('Experience pool is filled!')
        print('Filled %d/%d.' % (len(self.ExpPool._frames), MAX_EXP), end='\r')
        sys.stdout.flush()

    def save(self):
        # Pickle files must be opened in binary mode.
        with open('human_exp.pkl', 'wb') as f:
            pkl.dump(self.ExpPool, f)
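
# Usage sketch (an assumption, not code from the original project): a minimal
# pygame main loop that could drive Recorder, polling events, updating at a
# fixed rate, and saving the collected experience on exit. The 15 FPS cap and
# the `main` name are illustrative choices.
def main():
    recorder = Recorder()
    clock = pygame.time.Clock()
    running = True
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False
        recorder.update()  # read the keyboard, step the environment, redraw
        clock.tick(15)     # cap the recording rate at roughly 15 steps/sec
    recorder.save()        # persist the experience pool to human_exp.pkl
    pygame.quit()
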