def prepare_miniBatch(self, transitions_raw):
    """Convert a list of raw transitions into batched tensors for training.

    Args:
        transitions_raw: iterable of ExtendedTransition tuples
            (state, action, next_state, reward, done).

    Returns:
        Tuple (state, action, reward, nonFinalNextState, nonFinalMask,
        finalNextState, finalMask). `action` and `reward` have shape
        (batch, 1); the state/mask outputs come straight from
        self.stateProcessor — their exact types depend on that processor.

    Raises:
        NotImplementedError: if no stateProcessor is configured; this
            method relies on the processor to batch states and to split
            next states into final / non-final groups.
    """
    # Transpose the batch: list of transitions -> transition of lists.
    transitions = ExtendedTransition(*zip(*transitions_raw))

    action = torch.tensor(transitions.action, device=self.device,
                          dtype=torch.long).unsqueeze(-1)    # shape (batch, 1)
    reward = torch.tensor(transitions.reward, device=self.device,
                          dtype=torch.float32).unsqueeze(-1)  # shape (batch, 1)

    # For some envs the raw observations need processing before they can be
    # fed to the network; the processor also partitions next states by done
    # status.  Without a processor this agent cannot build the minibatch.
    if self.stateProcessor is None:
        raise NotImplementedError(
            'prepare_miniBatch requires a stateProcessor to batch states')

    state, _ = self.stateProcessor(transitions.state, device=self.device)
    nonFinalNextState, nonFinalMask, finalNextState, finalMask = \
        self.stateProcessor(transitions.next_state, device=self.device,
                            done=transitions.done)

    return state, action, reward, nonFinalNextState, nonFinalMask, finalNextState, finalMask
def store_experience(self, state, action, nextState, reward, done, info):
    """Append one single-step transition to replay memory.

    When hindsight experience replay is enabled, also derive and store a
    relabeled transition from the same step.
    """
    self.memory.push(ExtendedTransition(state, action, nextState, reward, done))
    if self.hindSightER:
        self.process_hindSightExperience(state, action, nextState, reward, done, info)
def process_hindSightExperience(self, state, action, nextState, reward, done, info):
    """Ask the env to relabel this step as a hindsight experience and store it.

    Runs only on non-terminal transitions and only every
    `hindSightERFreq` global steps; the relabeled transition is pushed to
    replay memory when the env returns a usable state.
    """
    # Guard: skip terminal steps and off-cycle steps.
    if done or self.globalStepCount % self.hindSightERFreq != 0:
        return

    stateNew, actionNew, nextStateNew, rewardNew, doneNew = self.env.getHindSightExperience(
        state, action, nextState, done, info)
    if stateNew is None:
        return

    self.memory.push(
        ExtendedTransition(stateNew, actionNew, nextStateNew, rewardNew, doneNew))
def store_experience(self, state, action, nextState, reward, done, info):
    """Store one transition in the per-stage replay memory.

    The transition is routed to self.memories[state['stageID']], so each
    stage keeps its own replay buffer.  An optional experienceProcessor can
    rewrite (state, action, nextState, reward) before storage.

    NOTE(review): this tags the caller's `done` dict in place with the
    current global step under the 'id' key — callers must not assume `done`
    is left untouched.
    """
    if self.experienceProcessor is not None:
        state, action, nextState, reward = self.experienceProcessor(
            state, action, nextState, reward, done, info)
    # caution: using multiple-step forward return can increase variance
    timeStep = state['stageID']
    done['id'] = self.globalStepCount
    transition = ExtendedTransition(state, action, nextState, reward, done)
    self.memories[timeStep].push(transition)
def prepare_minibatch(self, transitions_raw):
    """Batch raw transitions into tensors, splitting next states by done flag.

    Args:
        transitions_raw: iterable of ExtendedTransition tuples
            (state, action, next_state, reward, done).

    Returns:
        Tuple (state, nonFinalMask, nonFinalNextState, finalMask,
        finalNextState, action, reward).  `action` and `reward` have shape
        (batch, 1).  In the tensor path the masks are uint8 tensors of
        shape (batch,) and the next-state groups are stacked tensors (or
        empty lists when a group has no members).
    """
    # Transpose the batch: list of transitions -> transition of lists.
    transitions = ExtendedTransition(*zip(*transitions_raw))

    action = torch.tensor(transitions.action, device=self.device,
                          dtype=torch.long).unsqueeze(-1)    # shape (batch, 1)
    reward = torch.tensor(transitions.reward, device=self.device,
                          dtype=torch.float32).unsqueeze(-1)  # shape (batch, 1)

    # For some envs, the output state requires further processing before
    # being fed to the neural network.
    if self.stateProcessor is not None:
        state, _ = self.stateProcessor(transitions.state, self.device)
        nonFinalNextState, nonFinalMask, finalNextState, finalMask = self.stateProcessor(
            transitions.next_state, self.device, transitions.done)
    else:
        state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
        nextState = torch.tensor(transitions.next_state, device=self.device, dtype=torch.float32)

        # Final transitions are those whose episode (stage) has ended.
        finalMask = torch.tensor(transitions.done, device=self.device, dtype=torch.uint8)
        nonFinalMask = 1 - finalMask

        # Fix: iterate over the actual sampled batch size instead of
        # self.trainBatchSize — the sampled batch can legitimately be
        # smaller (e.g. while the replay memory is still filling), and
        # assuming trainBatchSize would index out of range or mis-split.
        batchSize = len(transitions_raw)
        finalNextState = [nextState[i] for i in range(batchSize) if finalMask[i]]
        nonFinalNextState = [nextState[i] for i in range(batchSize) if nonFinalMask[i]]

        if len(nonFinalNextState):
            nonFinalNextState = torch.stack(nonFinalNextState)
        if len(finalNextState):
            finalNextState = torch.stack(finalNextState)

    return state, nonFinalMask, nonFinalNextState, finalMask, finalNextState, action, reward
def store_experience(self, state, action, nextState, reward, done, info):
    """Record a single one-step transition in the replay memory."""
    self.memory.push(ExtendedTransition(state, action, nextState, reward, done))