def _fill_experience(self, sess):
    """
    Fill experience buffer until buffer is full.
    """
    prev_state = self.environment.last_state
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)

    pi_, _ = self.local_network.run_base_policy_and_value(
        sess, self.environment.last_state, last_action_reward)
    action = self.choose_action(pi_)

    new_state, reward, terminal, pixel_change = self.environment.process(action)
    frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change,
                            last_action, last_reward)
    self.experience.add_frame(frame)

    if terminal:
        self.environment.reset()
    if self.experience.is_full():
        self.environment.reset()
        print("Replay buffer filled")
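# The vector built by ExperienceFrame.concat_action_and_reward is fed to the recurrent
# policy/value network as the "last action and reward" input throughout these snippets.
# A minimal sketch of such a helper, assuming a one-hot action followed by the scalar
# reward (some variants below also pass the last state as an extra argument, which is
# not modelled here):
import numpy as np

def concat_action_and_reward(action, action_size, reward):
    # Hypothetical sketch, not the repository's actual implementation.
    action_reward = np.zeros(action_size + 1, dtype=np.float32)
    if action >= 0:  # assume a negative value may mark "no previous action"
        action_reward[action] = 1.0
    action_reward[-1] = float(reward)
    return action_reward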
def _fill_experience(self, sess):
    """
    Fill experience buffer until buffer is full.
    """
    prev_state = self.environment.last_state
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_intrinsic_reward = self.environment.last_intrinsic_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)
    input_map = self.environment.map

    (prev_localization_state, pi_, _, short_term_goal, shift_weights,
     location_distribution) = self.local_network.run_policy_and_value(
        sess, prev_state, last_action_reward, input_map, replan=False)
    action = self.choose_action(pi_)

    new_state, reward, intrinsic_reward, terminal = self.environment.process(
        action, short_term_goal, shift_weights)
    frame = ExperienceFrame(prev_state, input_map, prev_localization_state,
                            location_distribution, reward, intrinsic_reward,
                            action, terminal, last_action, last_reward,
                            last_intrinsic_reward)
    self.experience.add_frame(frame)

    if terminal:
        self.level_seed = np.random.randint(LEVEL_SET_SIZE)
        self.environment.reset(self.maze_size, self.level_seed)
    if self.experience.is_full():
        print("Replay buffer filled")
        sys.stdout.flush()
def _fill_experience(self, sess):
    """
    Fill experience buffer until buffer is full.
    """
    prev_state = self.environment.last_state
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward, prev_state)

    pi_, _, _ = self.local_network.run_base_policy_and_value(
        sess, self.environment.last_state, last_action_reward)
    action = self.choose_action(pi_)

    new_state, reward, terminal, pixel_change = self.environment.process(action, flag=0)
    # Drop segmentation masks ('objectType') from the stored state.
    frame = ExperienceFrame(
        {key: val for key, val in prev_state.items() if 'objectType' not in key},
        reward, action, terminal, pixel_change, last_action, last_reward)
    self.experience.add_frame(frame)

    if terminal:
        self.environment.reset()
    if self.experience.is_full():
        self.environment.reset()
        print("Replay buffer filled")
def _add_batch_to_exp(self, batch):
    # If we just started, copy the first state as the last state.
    if self.last_state is None:
        self.last_state = batch.si[0]

    for k in range(len(batch.si)):
        state = batch.si[k]
        action = batch.a[k]
        reward = batch.a_r[k][-1]
        self.episode_reward += reward
        features = batch.features[k]
        pixel_change = batch.pc[k]
        # Only the final step of the batch can be terminal.
        terminal = (k == len(batch.si) - 1 and batch.terminal)

        frame = ExperienceFrame(state, reward, action, terminal, features,
                                pixel_change, self.last_action, self.last_reward)
        self.experience.add_frame(frame)

        self.last_state = state
        self.last_action = action
        self.last_reward = reward

    if terminal:
        total_ep_reward = self.episode_reward
        self.episode_reward = 0
        return total_ep_reward
    else:
        return None
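# The experience buffer written to by _fill_experience and _add_batch_to_exp only needs
# add_frame() and is_full() in these snippets. A minimal sketch of such a buffer, assuming
# a fixed-capacity FIFO (class and parameter names are illustrative, not the actual API):
from collections import deque

class ExperienceBufferSketch:
    def __init__(self, history_size):
        self._history_size = history_size
        self._frames = deque(maxlen=history_size)  # oldest frames are dropped automatically

    def add_frame(self, frame):
        self._frames.append(frame)

    def is_full(self):
        return len(self._frames) >= self._history_size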
def process(self, sess):
    self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)

    last_action = self.env.last_action
    last_reward = np.clip(self.env.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)

    if not USE_PIXEL_CHANGE:
        pi_values, v_value = self.global_network.run_base_policy_and_value(
            sess, self.env.last_state, last_action_reward)
    else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
            sess, self.env.last_state, last_action_reward)
    self.value_history.add_value(v_value)

    action = self.choose_action(pi_values)
    state, reward, terminal, pc, vtrans, vrot = self.env.process(action)
    self.state_history.add_state(state)

    self.ep_reward += reward
    self.mazemap.update(vtrans, vrot)
    if reward > 9:
        # Reaching the maze goal yields a reward of 10, so the map must be reset.
        self.mazemap.reset()
    if terminal:
        # The lab environment ends an episode after 3600 frames by default, rather than
        # signalling terminal when the maze goal is reached.
        self.env.reset()
        self.ep_reward = 0
        self.mazemap.reset()

    self.show_ob(state, 3, 3, "Observation")
    self.show_pc(pc, 100, 3, 3.0, "Pixel Change")
    self.show_pc(pc_q[:, :, action], 200, 3, 0.4, "PC Q")
    self.show_map(300, 3, "Maze Map")
    self.show_pi(pi_values)
    self.show_reward()
    self.show_rp()
    self.show_value()
def process(self, sess, global_t, summary_writer, summary_op, score_input):
    """
    Run the policy for the configured number of episodes and print each episode's score.
    """
    self.environment.reset()
    for ep in range(self.environment.num_episodes):
        print("starting episode number {}!".format(ep))
        terminal = False
        while not terminal:
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            _last_state = self.environment.last_state
            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)
            action = self.choose_action(pi_)

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(action)
            self.episode_reward += reward

            if terminal:
                print("score={}".format(self.episode_reward))
                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

    self.environment.env.close()
def record(self, obs, reward, terminal, pc, action):
    last_state = self.env.last_state
    last_action = self.env.last_action
    last_reward = self.env.last_reward
    frame = ExperienceFrame(last_state, reward, action, terminal, pc,
                            last_action, last_reward)
    self.ExpPool.add_frame(frame)

    if self.ExpPool.is_full():
        print('Experience pool is filled!')
    print('Filled %d/%d.' % (len(self.ExpPool._frames), MAX_EXP), end='\r')
    sys.stdout.flush()
def process(self, sess):
    last_action = self.environment.last_action
    last_reward = np.clip(self.environment.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)

    if not USE_PIXEL_CHANGE:
        pi_values, v_value = self.global_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
    else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
            sess, self.environment.last_state, last_action_reward)
    self.value_history.add_value(v_value)

    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change, vtrans, vrot = self.environment.process(action)
    self.episode_reward += reward
    self.mazemap.update(vtrans, vrot)
    if reward > 9:
        self.mazemap.reset()

    if terminal:
        self.environment.reset()
        self.episode_reward = 0
        self.mazemap.reset()

    self.show_image(state[:, :, :3])
    self.show_policy(pi_values)
    self.show_value()
    self.show_reward()
    self.show_map()

    if USE_PIXEL_CHANGE:
        self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
        self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

    if USE_REWARD_PREDICTION:
        if self.state_history.is_full:
            rp_c = self.global_network.run_rp_c(sess, self.state_history.states)
            self.show_reward_prediction(rp_c, reward)

    self.state_history.add_state(state)
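# Every snippet selects the next action via self.choose_action(pi_values). Assuming
# pi_values is a normalized probability vector over the discrete action set, a plausible
# implementation is plain categorical sampling (a sketch, not the repository's code):
import numpy as np

def choose_action(pi_values):
    return np.random.choice(len(pi_values), p=pi_values)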
def process(self, sess):
    last_action = self.environment.last_action
    last_reward = np.clip(self.environment.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward, self.environment.last_state)

    if not flags.use_pixel_change:
        pi_values, v_value = self.global_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
    else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
            sess, self.environment.last_state, last_action_reward)

    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change = self.environment.process(action)
    self.episode_reward += reward

    if terminal:
        self.environment.reset()
        self.episode_reward = 0
def process(self, sess):
    last_action = self.environment.last_action
    last_reward = np.clip(self.environment.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward, self.environment.last_state)

    if not flags.use_pixel_change:
        pi_values, v_value = self.global_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
    else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
            sess, self.environment.last_state, last_action_reward)
    self.value_history.add_value(v_value)

    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change = self.environment.process(action)
    self.episode_reward += reward

    if terminal:
        self.environment.reset()
        self.episode_reward = 0

    self.show_image(state['image'])
    self.show_policy(pi_values)
    self.show_value()
    self.show_reward()

    if flags.use_pixel_change:
        self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
        self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

    if flags.use_reward_prediction:
        if self.state_history.is_full:
            rp_c = self.global_network.run_rp_c(sess, self.state_history.states)
            self.show_reward_prediction(rp_c, reward)

    self.state_history.add_state(state)
def process(self, sess):
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward, self.environment.last_state)

    if random_policy:
        pi_values = [1 / 3.0, 1 / 3.0, 1 / 3.0]
        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = self.environment.process(action)
        self.episode_reward[-1] += reward
    else:
        mode = "segnet" if flags.segnet >= 2 else ""
        segnet_preds = None
        if not flags.use_pixel_change:
            pi_values, v_value, segnet_preds = self.global_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward, mode=mode)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.environment.last_state, last_action_reward)

        if segnet_preds is not None:
            mask = self.environment.last_state.get('objectType', None)
            if mask is not None:
                new_classes = np.unique(mask)
                if segnet_preds.shape != mask.shape:
                    print("Predictions have shape {}, but groundtruth mask has shape {}".format(
                        segnet_preds.shape, mask.shape))
                else:
                    # Accumulate per-class pixel accuracy counts.
                    similar = segnet_preds == mask
                    for id_class in new_classes:
                        id_list = self.segnet_class_dict.get(id_class, None)
                        if id_list is None:
                            id_list = []
                        id_list += [[np.sum(similar[mask == id_class]), np.sum(mask == id_class)]]
                        self.segnet_class_dict[id_class] = id_list

        self.batch_cur_num += 1
        if flags.segnet == -1:  # just not necessary
            if self.batch_cur_num != 0 and self.batch_cur_num - self.batch_prev_num >= self.batch_size:
                feed_dict = {
                    self.global_network.base_input: self.batch_si,
                    self.global_network.base_segm_mask: self.batch_sobjT,
                    self.global_network.is_training: False
                }
                segm_loss, preds, confusion_mtx = sess.run(
                    [self.global_network.decoder_loss, self.global_network.preds,
                     self.global_network.update_evaluation_vars],
                    feed_dict=feed_dict)
                total_loss = 0
                self.total_loss += [total_loss]
                self.segm_loss += [segm_loss]
                # TODO: store these results somewhere.

                # Reset the accumulated batch.
                self.batch_prev_num = self.batch_cur_num
                self.batch_si = []
                self.batch_sobjT = []
                self.batch_a = []
            else:
                self.batch_si += [self.environment.last_state["image"]]
                self.batch_sobjT += [self.environment.last_state["objectType"]]
                self.batch_a += [self.environment.ACTION_LIST[self.environment.last_action]]

        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = self.environment.process(action)
        self.episode_reward[-1] += reward

    if terminal:
        ep_info = self.environment._episode_info
        if ep_info['task'] == 'room_goal':
            one_hot_room = ep_info['goal']['roomTypeEncoded']
            room_type = ep_info['goal']['roomType']
            ind = np.where(one_hot_room)[0][0]
            self.roomType_dict[ind] = room_type
            self.episode_roomtype += [ind]
        self.success_rate += [int(self.environment._last_full_state["success"])]
        self.environment.reset()
        self.episode_reward += [0]
def _process_base(self, sess, global_t, summary_writer, summary_op, score_input):
    # [Base A3C]
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    start_lstm_state = self.local_network.base_lstm_state_out

    # t_max times loop
    for _ in range(self.local_t_max):
        # Prepare last action reward
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        # Modify last state - with attention
        pi_, value_ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        states.append(self.environment.last_state)
        last_action_rewards.append(last_action_reward)
        actions.append(action)
        values.append(value_)

        if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            print("pi={}".format(pi_))
            print(" V={}".format(value_))

        prev_state = self.environment.last_state

        # Process game
        new_state, reward, terminal, pixel_change = self.environment.process(action)
        # Modify new state - with attention
        frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change,
                                last_action, last_reward)

        # Store to experience
        self.experience.add_frame(frame)

        self.episode_reward += reward
        rewards.append(reward)
        self.local_t += 1

        if terminal:
            terminal_end = True
            print("score={}".format(self.episode_reward))

            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, global_t)
            self.episode_reward = 0
            self.environment.reset()
            self.local_network.reset_state()
            break

    R = 0.0
    if not terminal_end:
        R = self.local_network.run_base_value(
            sess, new_state, frame.get_action_reward(self.action_size))

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_adv = []
    batch_R = []

    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
        R = ri + self.gamma * R
        adv = R - Vi
        a = np.zeros([self.action_size])
        a[ai] = 1.0

        batch_si.append(si)
        batch_a.append(a)
        batch_adv.append(adv)
        batch_R.append(R)

    batch_si.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()

    return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
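# The reversed loop at the end of _process_base computes n-step discounted returns
# R_t = r_t + gamma * R_{t+1}, bootstrapped from the value of the final state when the
# rollout was truncated rather than terminal, and advantages A_t = R_t - V_t. The same
# computation, isolated into a standalone helper for clarity (names are illustrative):
import numpy as np

def n_step_returns_and_advantages(rewards, values, bootstrap_value, gamma):
    # rewards and values are ordered forward in time; bootstrap_value is V(s_T) for a
    # truncated rollout, or 0.0 if the episode ended.
    R = bootstrap_value
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        returns.append(R)
        advantages.append(R - v)
    returns.reverse()
    advantages.reverse()
    return np.asarray(returns), np.asarray(advantages)

# Example: a 3-step rollout with gamma = 0.99 and a bootstrap value of 0.5.
# returns, advantages = n_step_returns_and_advantages([0.0, 0.0, 1.0], [0.4, 0.6, 0.8], 0.5, 0.99)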
def process(self, sess):
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        self.environment.last_action, self.action_size, self.environment.last_reward)
    map_input = self.environment.map

    (pi_values, v_value, location, angle, value_map, reward_map, short_term_goal,
     angle_neurons, local_map_prediction, local_map, actual_local_map, vlm_target,
     vlm_prediction, location_estimate, shift_weights) = self.global_network.run_display_values(
        sess, self.environment.last_state, last_action_reward, map_input, self.replan)

    if self.replan:
        self.path = []
        self.step_count = 0
        self.episode_reward = 0
        self.episode_intrinsic_reward = 0
        self.replan = False

    self.value_history.add_value(v_value)
    action = self.choose_action(pi_values)

    state, reward, intrinsic_reward, terminal = self.environment.process(
        action, short_term_goal, shift_weights)
    self.replan = False

    if terminal:
        print('Steps needed: ', self.step_count)
        sys.stdout.flush()
        self.environment.reset(DISPLAY_LEVEL[0], np.random.randint(LEVEL_SET_SIZE))
        self.global_network.reset_state()
        self.replan = True

    self.episode_reward += reward
    self.episode_intrinsic_reward += intrinsic_reward
    self.step_count += 1

    self.show_image(self.state)
    self.show_angle(angle)
    self.show_pixels(np.reshape(angle_neurons, [1, 30]), 370, 176, 4, 1, "Discretized Angle")
    self.show_pixels(np.reshape(shift_weights, [3, 3]), 400, 250, 20, 1, "Egomotion Estimation")
    self.show_pixels(vlm_target, 550, 8, 5, 1, "Visible Local Map Target", True)
    self.show_pixels(vlm_prediction, 550, 176, 5, 1, "Visible Local Map Estimation", True)
    self.show_pixels(actual_local_map, 725, 8, 5, 1, "Local Map Target", True)
    self.show_pixels(local_map_prediction, 725, 176, 5, 1, "Local Map Estimation", True)
    self.show_pixels(local_map, 900, 8, 5, 1, "Map Feedback Local Map", True)
    self.draw_text("Estimated Position: " + str(np.around(location_estimate)), 900, 220)
    self.draw_text("Actual Position: " + str(np.asarray(self.state['position'][2], 'float')), 900, 240)
    self.draw_text("STEPS: {}".format(int(self.step_count)), 900, 260)
    self.draw_text("REWARD: {}".format(float(self.episode_reward)), 900, 280)
    self.draw_text("INTRINSIC REWARD: {}".format(float(self.episode_intrinsic_reward)), 900, 300)

    disp_map = np.reshape(map_input, [126, 126, 1])
    self.show_map(disp_map, 8, 400, 3, 1, "Map", location, self.state['position'][1])
    self.show_map(self.scale_image(reward_map, 2), 400, 400, 3, 1,
                  "Reward Map, R = 0, G = +, B = -")

    stg = np.asarray([[0, short_term_goal[2], 0],
                      [short_term_goal[3], short_term_goal[4], short_term_goal[1]],
                      [0, short_term_goal[0], 0]])
    self.show_pixels(stg, 840, 400, 20, 1, "Short Term")
    self.draw_center_text("Target Direction", 870, 490)

    rp_c = self.global_network.run_map_rp_c(sess, self.state, state, map_input)
    self.show_reward_prediction(rp_c, reward, 820, 600, "Reward Prediction")

    self.show_policy(pi_values, action)
    self.show_value()

    self.state = state
    time.sleep(DISPLAY_SLOW_DOWN)
def process(self, sess):
    sess.run([tf.global_variables_initializer(),
              tf.local_variables_initializer()])

    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward, self.environment.last_state)

    preds = None
    mode = ""  # leave empty: segmentation predictions are not wanted here
    if not flags.use_pixel_change:
        pi_values, v_value, preds = self.global_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward, mode=mode)
    else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
            sess, self.environment.last_state, last_action_reward)
    self.value_history.add_value(v_value)

    prev_state = self.environment.last_state
    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change = self.environment.process(action)
    self.episode_reward += reward

    if terminal:
        self.environment.reset()
        self.episode_reward = 0

    self.show_image(state['image'])
    self.show_policy(pi_values)
    self.show_value()
    self.show_reward()

    if not flags.use_pixel_change:
        if preds is not None:
            self.show_pixel_change(self.label_to_rgb(preds), 100, 0, 3.0, "Preds")
            self.show_pixel_change(self.label_to_rgb(state['objectType']), 200, 0, 0.4, "Segm Mask")
    else:
        self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
        self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

    if flags.use_reward_prediction:
        if self.state_history.is_full:
            rp_c = self.global_network.run_rp_c(sess, self.state_history.states)
            self.show_reward_prediction(rp_c, reward)

    self.state_history.add_state(state)
def _process_base(self, sess, global_t, summary_writer, summary_op_dict, summary_dict):
    # [Base A3C]
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    start_lstm_state = None
    if self.use_lstm:
        start_lstm_state = self.local_network.base_lstm_state_out

    mode = "segnet" if self.segnet_mode >= 2 else ""

    # n-step TD loop
    flag = 0
    for _ in range(self.n_step_TD):
        # Prepare last action reward
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward, self.environment.last_state)

        pi_, value_, losses = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward, mode)

        action = self.choose_action(pi_)

        states.append(self.environment.last_state)
        last_action_rewards.append(last_action_reward)
        actions.append(action)
        values.append(value_)

        if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            print("Trainer {}>>> Local step {}:".format(self.thread_index, self.local_t))
            print("Trainer {}>>> pi={}".format(self.thread_index, pi_))
            print("Trainer {}>>> V={}".format(self.thread_index, value_))
            flag = 1

        prev_state = self.environment.last_state

        # Process game
        new_state, reward, terminal, pixel_change = self.environment.process(action, flag=flag)
        # Drop segmentation masks ('objectType') from the stored state.
        frame = ExperienceFrame(
            {key: val for key, val in prev_state.items() if 'objectType' not in key},
            reward, action, terminal, pixel_change, last_action, last_reward)

        # Store to experience
        self.experience.add_frame(frame)

        self.episode_reward += reward
        rewards.append(reward)
        self.local_t += 1

        if terminal:
            terminal_end = True
            print("Trainer {}>>> score={}".format(self.thread_index, self.episode_reward))

            summary_dict['values'].update({'score_input': self.episode_reward})

            success = 1 if self.environment._last_full_state["success"] else 0
            self.success_rates.append(success)
            summary_dict['values'].update({
                'sr_input': np.mean(self.success_rates)
                            if len(self.success_rates) == self.sr_size else 0
            })

            self.episode_reward = 0
            self.environment.reset()
            self.local_network.reset_state()
            if flag:
                flag = 0
            break

    R = 0.0
    if not terminal_end:
        R = self.local_network.run_base_value(
            sess, new_state, frame.get_action_reward(self.action_size))

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_adv = []
    batch_R = []
    batch_sobjT = []

    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
        R = ri + self.gamma * R
        adv = R - Vi
        a = np.zeros([self.action_size])
        a[ai] = 1.0

        batch_si.append(si['image'])
        batch_a.append(a)
        batch_adv.append(adv)
        batch_R.append(R)
        if self.segnet_param_dict["segnet_mode"] >= 2:
            batch_sobjT.append(si['objectType'])

    batch_si.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()
    batch_sobjT.reverse()

    # NOTE: possible A3C mathematical error here -- only the last values should be used
    # for the base loss, or they should be aggregated with the last one.
    return batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
def _process_base(self, sess, global_t, summary_writer, summary_op, score_input, average_entropy):
    # [Base A3C]
    states = []
    map_states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []
    episode_entropy = 0.0
    episode_steps = 0

    terminal_end = False

    start_lstm_state = self.local_network.base_lstm_state_out

    # t_max times loop
    for _ in range(LOCAL_T_MAX):
        # Prepare last action reward
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        prev_map_state = self.mazemap.get_map(84, 84)

        pi_, value_ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, prev_map_state, last_action_reward)
        action = self.choose_action(pi_)

        states.append(self.environment.last_state)
        map_states.append(prev_map_state)
        last_action_rewards.append(last_action_reward)
        actions.append(action)
        values.append(value_)

        if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            print("pi={}".format(pi_))
            print(" V={}".format(value_))

        prev_state = self.environment.last_state

        # Process game
        new_state, reward, terminal, pixel_change, vtrans, vrot = self.environment.process(action)
        self.mazemap.update(vtrans, vrot)
        if reward > 9:
            self.mazemap.reset()

        frame = ExperienceFrame(prev_state, prev_map_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)

        # Store to experience
        self.experience.add_frame(frame)

        self.episode_reward += reward
        episode_entropy += np.sum(pi_ * np.log(pi_))
        episode_steps += 1
        rewards.append(reward)
        self.local_t += 1

        if terminal:
            terminal_end = True
            print("score={}".format(self.episode_reward))

            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, average_entropy,
                               episode_entropy / episode_steps, global_t)
            self.episode_reward = 0
            episode_entropy = 0.0
            episode_steps = 0
            self.environment.reset()
            self.mazemap.reset()
            self.local_network.reset_state()
            break

    R = 0.0
    if not terminal_end:
        R = self.local_network.run_base_value(
            sess, new_state, self.mazemap.get_map(84, 84),
            frame.get_last_action_reward(self.action_size))

    actions.reverse()
    states.reverse()
    map_states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_mi = []
    batch_a = []
    batch_adv = []
    batch_R = []

    for (ai, ri, si, mi, Vi) in zip(actions, rewards, states, map_states, values):
        R = ri + GAMMA * R
        adv = R - Vi
        a = np.zeros([self.action_size])
        a[ai] = 1.0

        batch_si.append(si)
        batch_mi.append(mi)
        batch_a.append(a)
        batch_adv.append(adv)
        batch_R.append(R)

    batch_si.reverse()
    batch_mi.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()

    return batch_si, batch_mi, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
def _add_frame(self, experience, reward):
    frame = ExperienceFrame(0, reward, 0, False, 0, 0, 0)
    experience.add_frame(frame)
def _process_base(self, sess, global_t, map_input):
    # [Base A3C]
    states = []
    actions = []
    batch_last_action_rewards = []
    rewards = []
    values = []

    terminal_end = False
    replan = (self.apply_next_location_loss == 0.0)

    start_localization_state = self.local_network.localization_state_out

    # t_max times loop
    for _ in range(LOCAL_T_MAX):
        self.local_t += 1

        # Previous state
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_intrinsic_reward = self.environment.last_intrinsic_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        (prev_localization_state, pi_, value_, short_term_goal, shift_weights,
         location_distribution) = self.local_network.run_policy_and_value(
            sess, prev_state, last_action_reward, map_input, replan)
        replan = False
        action = self.choose_action(pi_)

        states.append(prev_state)
        actions.append(ExperienceFrame.get_action_neurons(action, self.action_size))
        batch_last_action_rewards.append(last_action_reward)
        values.append(value_)

        if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            print("pi={}".format(pi_))
            print(" V={}".format(value_))

        # Process game
        new_state, reward, intrinsic_reward, terminal = self.environment.process(
            action, short_term_goal, shift_weights)
        frame = ExperienceFrame(prev_state, map_input, prev_localization_state,
                                location_distribution, reward, intrinsic_reward,
                                action, terminal, last_action, last_reward,
                                last_intrinsic_reward)

        # Store to experience
        self.experience.add_frame(frame)

        self.episode_reward += reward + intrinsic_reward
        rewards.append(reward + intrinsic_reward)

        if terminal:
            terminal_end = True
            if reward > 0:
                self.correct_exits += 1

            steps_needed = self.local_t - self.last_terminal_local_t
            self.last_terminal_local_t = self.local_t
            self.steps_buffer.append(steps_needed)
            if len(self.steps_buffer) > 50:
                self.steps_buffer.popleft()

            print("Steps needed: ", steps_needed)
            print("score={}".format(self.episode_reward))
            self.episode_reward = 0

            # Increase the maze size once the moving average over the last 50 episodes
            # is low enough for the current size.
            if (np.mean(self.steps_buffer) < 100 + (self.maze_size - 7) * 20
                    and len(self.steps_buffer) == 50):
                self.maze_size += 2
                if self.maze_size > 13:
                    print(">>>>>>>>>>> REACHED END <<<<<<<<<<<")
                    self.environment.stop()
                    sys.stdout.flush()
                    self.running = False
                    break
                print(">>>>>> SWITCHING TO MAZES OF SIZE ", self.maze_size, "x",
                      self.maze_size, " AT GLOBAL T ", global_t, " <<<<<<<<<<<<<<<")
                sys.stdout.flush()
                # Reset moving average.
                self.correct_exits = 0
                self.steps_buffer = deque()

            self.level_seed = np.random.randint(LEVEL_SET_SIZE)
            self.environment.reset(self.maze_size, self.level_seed)
            self.local_network.reset_state()
            break

        last_action_reward = ExperienceFrame.concat_action_and_reward(
            action, self.action_size, reward)

    R = 0.0
    if not terminal_end:
        R = self.local_network.run_value(sess, new_state, last_action_reward, frame.map)
        self.apply_next_location_loss = 1.0
    else:
        self.apply_next_location_loss = 0.0

    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_adv = []
    batch_R = []

    for (ri, si, Vi) in zip(rewards, states, values):
        R = ri + GAMMA * R
        adv = R - Vi

        batch_si.append(si)
        batch_adv.append(adv)
        batch_R.append(R)

    batch_si.reverse()
    batch_adv.reverse()
    batch_R.reverse()

    return batch_si, batch_last_action_rewards, actions, batch_adv, batch_R, start_localization_state