def _process_base(self, sess, global_t, summary_writer, summary_op, score_input):
    # [Base A3C]
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    start_lstm_state = self.local_network.base_lstm_state_out

    # t_max times loop
    for _ in range(self.local_t_max):
        # Prepare last action reward
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        # Modify last state - with attention
        pi_, value_ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)

        action = self.choose_action(pi_)

        states.append(self.environment.last_state)
        last_action_rewards.append(last_action_reward)
        actions.append(action)
        values.append(value_)

        if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            print("pi={}".format(pi_))
            print(" V={}".format(value_))

        prev_state = self.environment.last_state

        # Process game
        new_state, reward, terminal, pixel_change = self.environment.process(action)

        # Modify new state - with attention
        frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change,
                                last_action, last_reward)

        # Store to experience
        self.experience.add_frame(frame)

        self.episode_reward += reward
        rewards.append(reward)
        self.local_t += 1

        if terminal:
            terminal_end = True
            print("score={}".format(self.episode_reward))

            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, global_t)

            self.episode_reward = 0
            self.environment.reset()
            self.local_network.reset_state()
            break

    R = 0.0
    if not terminal_end:
        R = self.local_network.run_base_value(
            sess, new_state, frame.get_action_reward(self.action_size))

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_adv = []
    batch_R = []

    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
        R = ri + self.gamma * R
        adv = R - Vi
        a = np.zeros([self.action_size])
        a[ai] = 1.0

        batch_si.append(si)
        batch_a.append(a)
        batch_adv.append(adv)
        batch_R.append(R)

    batch_si.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()

    return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
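# --- Illustrative sketch (not part of the trainer): the reversed loop above
# --- implements the bootstrapped n-step return R_i = r_i + gamma * R_{i+1},
# --- seeded with the value of the last state (or 0.0 on a terminal step), and
# --- the advantage A_i = R_i - V(s_i). The helper name `n_step_returns` is
# --- hypothetical and only mirrors that computation in standalone form.
import numpy as np

def n_step_returns(rewards, values, bootstrap_value, gamma):
    R = bootstrap_value
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        returns.append(R)
        advantages.append(R - v)
    returns.reverse()      # restore chronological order,
    advantages.reverse()   # matching the batch_* lists above
    return np.asarray(returns), np.asarray(advantages)

# Example: 3-step rollout, gamma=0.99, bootstrapped from V(s_3)=0.5
rets, advs = n_step_returns([0.0, 0.0, 1.0], [0.4, 0.6, 0.8], 0.5, 0.99)
print(rets)  # ~[1.4652, 1.4800, 1.4950]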
def _process_base(self, sess, global_t, summary_writer, summary_op_dict, summary_dict):  #, losses_input):
    # [Base A3C]
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    start_lstm_state = None
    if self.use_lstm:
        start_lstm_state = self.local_network.base_lstm_state_out
    mode = "segnet" if self.segnet_mode >= 2 else ""

    # t_max times loop
    flag = 0
    for _ in range(self.n_step_TD):
        # Prepare last action reward
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward, self.environment.last_state)

        pi_, value_, losses = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward, mode)

        action = self.choose_action(pi_)

        states.append(self.environment.last_state)
        last_action_rewards.append(last_action_reward)
        actions.append(action)
        values.append(value_)

        if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            print("Trainer {}>>> Local step {}:".format(self.thread_index, self.local_t))
            print("Trainer {}>>> pi={}".format(self.thread_index, pi_))
            print("Trainer {}>>> V={}".format(self.thread_index, value_))
            flag = 1

        prev_state = self.environment.last_state

        # Process game
        new_state, reward, terminal, pixel_change = self.environment.process(action, flag=flag)

        # Exclude the segmentation target (objectType) from the frame stored in
        # the experience buffer; it is collected separately for the batch below.
        frame = ExperienceFrame(
            {key: val for key, val in prev_state.items() if 'objectType' not in key},
            reward, action, terminal, pixel_change, last_action, last_reward)

        # Store to experience
        self.experience.add_frame(frame)
        # Use to inspect experience collection
        #print(self.experience.get_debug_string())

        self.episode_reward += reward
        rewards.append(reward)
        self.local_t += 1

        if terminal:
            terminal_end = True
            print("Trainer {}>>> score={}".format(self.thread_index, self.episode_reward))  #, flush=True)

            summary_dict['values'].update({'score_input': self.episode_reward})

            success = 1 if self.environment._last_full_state["success"] else 0
            #print("Type:", type(self.environment._last_full_state["success"]), len(self.success_rates), success)
            self.success_rates.append(success)
            # Report the success rate only once the sliding window is full.
            summary_dict['values'].update({
                'sr_input': np.mean(self.success_rates) if len(self.success_rates) == self.sr_size else 0})

            self.episode_reward = 0
            self.environment.reset()
            self.local_network.reset_state()
            if flag:
                flag = 0
            break

    # Bootstrap the return from the value of the last state unless the episode ended.
    R = 0.0
    if not terminal_end:
        R = self.local_network.run_base_value(
            sess, new_state, frame.get_action_reward(self.action_size))

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_adv = []
    batch_R = []
    batch_sobjT = []

    # Accumulate discounted n-step returns and advantages in reverse time order.
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
        R = ri + self.gamma * R
        adv = R - Vi
        a = np.zeros([self.action_size])
        a[ai] = 1.0

        batch_si.append(si['image'])
        batch_a.append(a)
        batch_adv.append(adv)
        batch_R.append(R)
        if self.segnet_param_dict["segnet_mode"] >= 2:
            batch_sobjT.append(si['objectType'])

    # Restore chronological order for the training batch.
    batch_si.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()
    batch_sobjT.reverse()
    #print(np.unique(batch_sobjT))

    ## NOTE: possible A3C math error here: only the last values should be used
    ## for the base, or they should be aggregated with the last ones made.
    return batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
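# --- Illustrative sketch (not part of the trainer): the 'sr_input' summary above
# --- assumes self.success_rates behaves as a bounded window of the last
# --- self.sr_size episode outcomes, with the mean reported only once the window
# --- is full. A minimal standalone version of that bookkeeping, with the window
# --- length chosen arbitrarily for the example:
from collections import deque

import numpy as np

sr_size = 100  # window length; the actual value of self.sr_size may differ
success_rates = deque(maxlen=sr_size)

def record_episode(success):
    # Append 1 for a successful episode, 0 otherwise, and return the value
    # that would be written to the 'sr_input' summary.
    success_rates.append(1 if success else 0)
    return float(np.mean(success_rates)) if len(success_rates) == sr_size else 0.0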