def finalizeRecord(self, reward, domainInControl=None):
    if domainInControl is None:
        domainInControl = self.domainString
    if self.episodes[domainInControl] is None:
        self.logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
        return

    # normalise the total return to roughly [-1, 1]
    reward /= 20.0

    terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())

    if self.replay_type == 'vanilla':
        self.episodes[domainInControl].record(state=terminal_state,
                                              state_ori=TerminalState(), action=terminal_action,
                                              reward=reward, terminal=True)
    elif self.replay_type == 'prioritized':
        # heuristically assign 0.0 to the Q estimates of the terminal transition; they are not used
        self.episodes[domainInControl].record(state=terminal_state,
                                              state_ori=TerminalState(), action=terminal_action,
                                              reward=reward,
                                              Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0,
                                              uniform=False, terminal=True)
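
# A minimal sketch (an assumption, not the original buffer implementation) of how a
# prioritized replay buffer could turn the Q_s_t_a_t_ / gamma_Q_s_tplu1_maxa_ arguments
# recorded above into a TD-error based priority |r + gamma * max_a Q(s', a) - Q(s, a)|.
# The helper name td_error_priority is hypothetical.
def td_error_priority(reward, Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_, epsilon=1e-6):
    """Absolute one-step TD error plus a small constant so no transition gets zero priority."""
    return abs(reward + gamma_Q_s_tplu1_maxa_ - Q_s_t_a_t_) + epsilon
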
def finalizeRecord(self, reward, domainInControl=None):
    if domainInControl is None:
        domainInControl = self.domainString
    if self.episodes[domainInControl] is None:
        logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
        return

    # normalise the total return to roughly [-1, 1]
    reward /= 20.0

    terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())

    if self.replay_type == 'vanilla':
        self.episodes[domainInControl].record(state=terminal_state,
                                              state_ori=TerminalState(), action=terminal_action,
                                              reward=reward, terminal=True)
    elif self.replay_type == 'prioritized':
        # heuristically assign 0.0 to Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_; it does not matter as they are not used
        if True:  # if self.samplecount >= self.capacity:
            self.episodes[domainInControl].record(state=terminal_state,
                                                  state_ori=TerminalState(), action=terminal_action,
                                                  reward=reward,
                                                  Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0,
                                                  uniform=False, terminal=True)
        else:
            self.episodes[domainInControl].record(state=terminal_state,
                                                  state_ori=TerminalState(), action=terminal_action,
                                                  reward=reward,
                                                  Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0,
                                                  uniform=True, terminal=True)
    return
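
# A hedged sketch (not from the original buffer code) of the proportional prioritized
# sampling that the uniform=False branch above presumably relies on: transition i is
# drawn with probability P(i) = p_i**alpha / sum_k p_k**alpha (Schaul et al., prioritized
# experience replay), with a uniform fallback matching the uniform=True branch.
import numpy as np

def sample_indices(priorities, batch_size, alpha=0.6, uniform=False):
    priorities = np.asarray(priorities, dtype=np.float64)
    if uniform or priorities.sum() == 0.0:
        # uniform fallback, e.g. while the buffer is still filling up
        probs = np.full(len(priorities), 1.0 / len(priorities))
    else:
        scaled = priorities ** alpha
        probs = scaled / scaled.sum()
    return np.random.choice(len(priorities), size=batch_size, p=probs)
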
def finalizeRecord(self, reward, domainInControl=None):
    if domainInControl is None:
        domainInControl = self.domainString
    if self.episodes[domainInControl] is None:
        logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
        return

    # normalise the total return to roughly [-1, 1]
    reward /= 20.0

    terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
    value = 0.0  # has no effect on experience replay

    def calculate_discountR_advantage(r_episode, v_episode):
        #########################################################################
        # Take the rewards and values from the rollout and use them to compute
        # the discounted returns and the advantage.
        # The advantage uses "Generalized Advantage Estimation".
        bootstrap_value = 0.0
        self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
        discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
        self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
        advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
        advantage = discount(advantage, self.gamma)
        #########################################################################
        return discounted_r_episode, advantage

    if self.replay_type == 'vanilla':
        self.episodes[domainInControl].record(state=terminal_state,
                                              state_ori=TerminalState(), action=terminal_action,
                                              reward=reward, value=value,
                                              terminal=True, distribution=None)
    elif self.replay_type == 'prioritized':
        episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(
            state=terminal_state, state_ori=TerminalState(),
            action=terminal_action, reward=reward, value=value)
        # TD_error is the list of TD errors in the current episode
        _, TD_error = calculate_discountR_advantage(episode_r, episode_v)
        episodic_TD = np.mean(np.absolute(TD_error))
        print('episodic_TD: {}'.format(episodic_TD))
        self.episodes[domainInControl].insertPriority(episodic_TD)
    return
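
# The discount() helper used in calculate_discountR_advantage is not shown in this file.
# A common implementation (an assumption here, not necessarily the original one) computes
# the discounted cumulative sum x_t + gamma * x_{t+1} + gamma**2 * x_{t+2} + ... with a
# reversed linear filter.
import numpy as np
import scipy.signal

def discount(x, gamma):
    """Discounted cumulative sum of a 1-D sequence of rewards or TD residuals."""
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
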
def finalizeRecord(self, reward, domainInControl=None):
    if domainInControl is None:
        domainInControl = self.domainString
    if self.episodes[domainInControl] is None:
        logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
        return

    # normalise the total return to roughly [-1, 1]
    reward /= 20.0

    terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())

    if self.replay_type == 'vanilla':
        self.episodes[domainInControl].record(state=terminal_state,
                                              state_ori=TerminalState(), action=terminal_action,
                                              reward=reward, terminal=True)
    elif self.replay_type == 'prioritized':
        # heuristically assign 0.0 to Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_; it does not matter as they are not used
        if True:  # if self.samplecount >= self.capacity:
            self.episodes[domainInControl].record(state=terminal_state,
                                                  state_ori=TerminalState(), action=terminal_action,
                                                  reward=reward,
                                                  Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0,
                                                  uniform=False, terminal=True)
        else:
            self.episodes[domainInControl].record(state=terminal_state,
                                                  state_ori=TerminalState(), action=terminal_action,
                                                  reward=reward,
                                                  Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0,
                                                  uniform=True, terminal=True)
    return
def finalizeRecord(self, reward, domainInControl=None):
    if domainInControl is None:
        domainInControl = self.domainString
    if self.episodes[domainInControl] is None:
        logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
        return

    # # normalising total return to -1~1
    # reward /= 20.0

    terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())

    self.mem_last_state = self.mem_cur_state
    self.mem_last_action = self.mem_cur_action
    self.mem_last_mask = self.mem_cur_mask
    self.mem_cur_state = np.vstack([np.expand_dims(x, 0) for x in [terminal_state]])
    self.mem_cur_action = None
    self.mem_cur_mask = torch.zeros(self.action_dim).type(FloatTensor)

    state = self.mem_last_state
    action = self.mem_last_action
    next_state = self.mem_cur_state
    terminal = True

    if state is not None:
        self.trans_mem.append(
            self.trans(
                torch.from_numpy(state).type(FloatTensor),       # state
                action,                                          # action
                torch.from_numpy(next_state).type(FloatTensor),  # next state
                torch.from_numpy(reward).type(FloatTensor),      # reward
                terminal,                                        # terminal
                self.mem_last_mask,                              # action mask
                self.mem_cur_mask))                              # next action mask

        # randomly produce a preference for calculating priority
        # preference = self.w_kept
        preference = torch.randn(self.model_.reward_size)
        preference = (torch.abs(preference) / torch.norm(preference, p=1)).type(FloatTensor)
        state = torch.from_numpy(state).type(FloatTensor)
        _, q = self.model_(
            Variable(state, requires_grad=False),
            Variable(preference.unsqueeze(0), requires_grad=False))
        q = q.data[0, action]

        if self.algorithm == 'naive':
            wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
            if not terminal:
                next_state = torch.from_numpy(next_state).type(FloatTensor)
                hq, _ = self.model_(
                    Variable(next_state, requires_grad=False),
                    Variable(preference.unsqueeze(0), requires_grad=False))
                hq = hq.data[0]
                p = abs(wr + self.gamma * hq - q)
            else:
                self.w_kept = None
                # if self.epsilon_decay:
                #     self.epsilon -= self.epsilon_delta
                p = abs(wr - q)
        elif self.algorithm == 'envelope':
            wq = preference.dot(q)
            wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
            if not terminal:
                next_state = torch.from_numpy(next_state).type(FloatTensor)
                hq, _ = self.model_(
                    Variable(next_state, requires_grad=False),
                    Variable(preference.unsqueeze(0), requires_grad=False))
                hq = hq.data[0]
                whq = preference.dot(hq)
                p = abs(wr + self.gamma * whq - wq)
            else:
                self.w_kept = None
                # if self.epsilon_decay:
                #     self.epsilon -= self.epsilon_delta
                # if self.homotopy:
                #     self.beta += self.beta_delta
                #     self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta
                p = abs(wr - wq)

        p += 1e-5
        self.priority_mem.append(p)

        if len(self.trans_mem) > self.mem_size:
            self.trans_mem.popleft()
            self.priority_mem.popleft()
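
# A minimal numpy sketch (an illustration, not the original torch code) of the preference
# handling above: a random weight vector is projected onto the probability simplex via
# |w| / ||w||_1, and the priority is the scalarised one-step TD error under that weight,
# mirroring the 'envelope' branch. Both helper names are hypothetical.
import numpy as np

def random_preference(reward_size):
    w = np.random.randn(reward_size)
    return np.abs(w) / np.abs(w).sum()

def scalarised_priority(w, reward_vec, q_sa, q_next, gamma, terminal, eps=1e-5):
    # scalarise the multi-objective reward and Q vectors with the sampled preference
    target = w.dot(reward_vec)
    if not terminal:
        target += gamma * w.dot(q_next)
    return abs(target - w.dot(q_sa)) + eps
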