    def store_experience(self, state, action, nextState, reward, info):

        if self.experienceProcessor is not None:
            state, action, nextState, reward = self.experienceProcessor(
                state, action, nextState, reward, info)
        # caution: using a multi-step forward return can increase variance
        if self.nStepForward > 1:

            # if this is the final state, do additional backups so the episode yields more useful learning experience
            if nextState is None:
                transitions = []
                transitions.append(Transition(state, action, nextState,
                                              reward))
                R = reward
                while len(self.nStepBuffer) > 0:
                    state, action, next_state, reward_old = self.nStepBuffer.pop(
                        0)
                    R = reward_old + self.gamma * R
                    transNew = Transition(state, action, None, R)
                    transitions.append(transNew)
                for tran in transitions:
                    if self.priorityMemoryOption:
                        self.memory.store(tran)
                    else:
                        self.memory.push(tran)

            else:
                # otherwise compute the standard n-step return
                self.nStepBuffer.append((state, action, nextState, reward))

                if len(self.nStepBuffer) < self.nStepForward:
                    return

                R = sum([
                    self.nStepBuffer[i][3] * (self.gamma**i)
                    for i in range(self.nStepForward)
                ])

                state, action, _, _ = self.nStepBuffer.pop(0)

                transition = Transition(state, action, nextState, R)

                if self.priorityMemoryOption:
                    self.memory.store(transition)
                else:
                    self.memory.push(transition)

        else:
            # otherwise this is a one-step return
            transition = Transition(state, action, nextState, reward)

            if self.priorityMemoryOption:
                self.memory.store(transition)
            else:
                self.memory.push(transition)
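
The n-step branch above accumulates discounted rewards held in nStepBuffer before bootstrapping. As a standalone illustration (not part of the agent class; gamma and rewards below are made-up values), the return built by the sum over the buffer is:

gamma = 0.99
rewards = [1.0, 0.0, 2.0]  # rewards r_t, r_t+1, r_t+2 held in nStepBuffer
R = sum(r * gamma**i for i, r in enumerate(rewards))
# R = 1.0 + 0.99 * 0.0 + 0.99**2 * 2.0 = 2.9602, the discounted n-step return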
Example #2
    def prepare_minibatch(self, state, action, nextState, reward, info):
        # first store the experience in memory

        self.store_experience(state, action, nextState, reward, info)
        if len(self.memory) < self.trainBatchSize:
            return
        transitions_raw = self.memory.sample(self.trainBatchSize)
        transitions = Transition(*zip(*transitions_raw))
        action = torch.tensor(transitions.action,
                              device=self.device,
                              dtype=torch.float32)  # shape(batch, numActions)
        reward = torch.tensor(transitions.reward,
                              device=self.device,
                              dtype=torch.float32)  # shape(batch)

        # for some envs, the output state requires further processing before being fed to the neural network
        if self.stateProcessor is not None:
            state, _ = self.stateProcessor(transitions.state, self.device)
            nonFinalNextState, nonFinalMask = self.stateProcessor(
                transitions.next_state, self.device)
        else:
            state = torch.tensor(transitions.state,
                                 device=self.device,
                                 dtype=torch.float32)
            nonFinalMask = torch.tensor(tuple(
                map(lambda s: s is not None, transitions.next_state)),
                                        device=self.device,
                                        dtype=torch.bool)
            nonFinalNextState = torch.tensor(
                [s for s in transitions.next_state if s is not None],
                device=self.device,
                dtype=torch.float32)

        return state, nonFinalMask, nonFinalNextState, action, reward
Example #3
    def prepare_minibatch(self, transitions_raw):
        '''
        do some preprocessing work on transitions_raw
        order the data
        convert transition list to torch tensors
        use trick from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
        https://stackoverflow.com/questions/19339/transpose-unzip-function-inverse-of-zip/19343#19343
        '''

        transitions = Transition(*zip(*transitions_raw))
        action = torch.tensor(transitions.action, device=self.device, dtype=torch.long).unsqueeze(-1)  # shape(batch, 1)
        reward = torch.tensor(transitions.reward, device=self.device, dtype=torch.float32).unsqueeze(-1)  # shape(batch, 1)

        # for some envs, the output state requires further processing before being fed to the neural network
        if self.stateProcessor is not None:
            state, _ = self.stateProcessor(transitions.state, self.device)
            nonFinalNextState, nonFinalMask = self.stateProcessor(transitions.next_state, self.device)
        else:
            state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
            nonFinalMask = torch.tensor(tuple(map(lambda s: s is not None, transitions.next_state)), device=self.device,
                                        dtype=torch.bool)
            nonFinalNextState = torch.tensor([s for s in transitions.next_state if s is not None], device=self.device,
                                             dtype=torch.float32)

        return state, nonFinalMask, nonFinalNextState, action, reward
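
The Transition(*zip(*transitions_raw)) line uses the zip-transpose trick cited in the docstring: a list of Transition tuples becomes one Transition of per-field tuples. A minimal standalone sketch (the Transition namedtuple is redefined locally for illustration only):

from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

transitions_raw = [Transition(0.1, 0, 0.2, 1.0), Transition(0.2, 1, None, 0.0)]
batch = Transition(*zip(*transitions_raw))
# batch.state == (0.1, 0.2)
# batch.action == (0, 1)
# batch.next_state == (0.2, None), so final states can be masked out downstream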
Example #4
    def sample(self, batch_size):
        finish = random.sample(range(0, len(self.memory)), batch_size)
        begin = [x - self.seq_length for x in finish]
        samples = []
        for start, end in zip(begin, finish):
            # correct for sampling near beginning
            # final is a list
            final = self.memory[max(start + 1, 0):end + 1]

            # correct for sampling across episodes
            # remove experiences belonging to the previous episode
            for i in range(len(final) - 2, -1, -1):
                if final[i][3] is None:
                    final = final[i + 1:]
                    break

            # pad the beginning of sequences that were truncated (near the buffer start or at an episode boundary)
            while len(final) < self.seq_length:
                dummyTransition = Transition(np.zeros_like(self.memory[0][0]), 0, np.zeros_like(self.memory[0][2]), 0)
                final = [dummyTransition] + final

            samples += final

        # returns flattened version
        return samples
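
The sampler above returns fixed-length windows that end at each sampled index, truncated at episode boundaries and left-padded with dummy transitions. A rough standalone sketch of the windowing and padding (seq_length, the memory contents, and the 'PAD' placeholder are made up for illustration):

seq_length = 4
memory = ['t0', 't1', 't2', 't3', 't4']     # stand-ins for stored Transitions
end = 2                                     # a sampled "finish" index
start = end - seq_length                    # may be negative near the buffer start
window = memory[max(start + 1, 0):end + 1]  # ['t0', 't1', 't2']
while len(window) < seq_length:
    window = ['PAD'] + window               # left-pad up to the fixed sequence length
# window == ['PAD', 't0', 't1', 't2']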
    def store_experience(self, states, actions, nextStates, reward, info):

        transitions = [
            Transition(states[n], actions[n], nextStates[n], reward)
            for n in range(self.numAgents)
        ]
        self.memory.push(transitions)
    def prepare_minibatch(self, transitions_raw, n):
        # order the data and convert the transition list to torch tensors

        transitions = Transition(*zip(*transitions_raw))
        action = torch.tensor(transitions.action,
                              device=self.device,
                              dtype=torch.long).unsqueeze(
                                  -1)  # shape(batch, 1)
        reward = torch.tensor(transitions.reward,
                              device=self.device,
                              dtype=torch.float32).unsqueeze(
                                  -1)  # shape(batch, 1)

        # for some envs, the output state requires further processing before being fed to the neural network
        if self.stateProcessors is not None:
            state, _ = self.stateProcessors[n](transitions.state, self.device)
            nonFinalNextState, nonFinalMask = self.stateProcessors[n](
                transitions.next_state, self.device)
        else:
            state = torch.tensor(transitions.state,
                                 device=self.device,
                                 dtype=torch.float32)
            nonFinalMask = torch.tensor(tuple(
                map(lambda s: s is not None, transitions.next_state)),
                                        device=self.device,
                                        dtype=torch.bool)
            nonFinalNextState = torch.tensor(
                [s for s in transitions.next_state if s is not None],
                device=self.device,
                dtype=torch.float32)

        return state, nonFinalMask, nonFinalNextState, action, reward
Example #7
    def process_experienceAugmentation(self, state, action, nextState, reward,
                                       info):
        if self.globalStepCount % self.experienceAugmentationFreq == 0:
            state_Augs, action_Augs, nextState_Augs, reward_Augs = self.env.getExperienceAugmentation(
                state, action, nextState, reward, info)
            for i in range(len(state_Augs)):
                transition = Transition(state_Augs[i], action_Augs[i],
                                        nextState_Augs[i], reward_Augs[i])
                self.memory.push(transition)
Example #8
    def store_experience(self, states, actions, nextStates, rewards, infos):

        for i in range(len(states)):
            # if the episode ended due to stepLimit, do not store the experience (because of the vec env setup)
            if not infos[i]['endBeforeDone']:
                transition = Transition(states[i], actions[i], nextStates[i], rewards[i])
                self.memory.push(transition)
                if self.successRepeat and nextStates[i] is None:
                    for _ in range(self.successRepeatTime):
                        self.memory.push(transition)
Example #9
    def process_hindSightExperience(self, state, action, nextState, reward,
                                    info):
        if nextState is not None and self.globalStepCount % self.hindSightERFreq == 0:
            stateNew, actionNew, nextStateNew, rewardNew = self.env.getHindSightExperience(
                state, action, nextState, info)
            if stateNew is not None:
                transition = Transition(stateNew, actionNew, nextStateNew,
                                        rewardNew)
                self.memory.push(transition)
                if self.experienceAugmentation:
                    self.process_experienceAugmentation(
                        state, action, nextState, reward, info)
    def store_experience(self, state, action, nextState, reward, info):
        if self.experienceProcessor is not None:
            state, action, nextState, reward = self.experienceProcessor(state, action, nextState, reward, info)

        timeStep = state['timeStep']
        transition = Transition(state, action, nextState, reward)
        self.memories[timeStep].push(transition)

        if self.experienceAugmentation:
            self.process_experienceAugmentation(state, action, nextState, reward, info)

        if self.hindSightER:
            self.process_hindSightExperience(state, action, nextState, reward, info)
    def push(self, *args):
        """Saves a transition"""
        if len(args) == 1 and isinstance(*args, Transition):
            transition = args[0]
        else:
            transition = Transition(*args)

        # if it is a terminal state
        if transition.next_state is None:

            if len(self.terminalMemory) < self.capacity:
                self.terminalMemory.append(None)

            self.terminalMemory[self.positionTwo] = transition
            self.positionTwo = (self.positionTwo + 1) % self.capacity
            count = 1
            R = transition.reward
            for trans in self.nStepBuffer[::-1]:
                # if nStepBackup is zero, no backup is performed
                if count > self.nStepBackup:
                    break
                R = trans.reward + self.gamma * R
                transNew = Transition(trans.state, trans.action, None, R)
                if len(self.terminalMemory) < self.capacity:
                    self.terminalMemory.append(None)
                self.terminalMemory[self.positionTwo] = transNew
                self.positionTwo = (self.positionTwo + 1) % self.capacity
                count += 1

            self.nStepBuffer.clear()
        else:  # non-terminal state
            if len(self.memory) < self.capacity:
                self.memory.append(None)

            self.nStepBuffer.append(transition)
            # overwrite the oldest experience
            self.memory[self.positionOne] = transition
            self.positionOne = (self.positionOne + 1) % self.capacity
    def update_net(self, state, action, nextState, reward):
        # first store the experience in memory

        self.store_experience(state, action, nextState, reward)

        if len(self.memory) < self.trainBatchSize:
            return

        transitions_raw = self.memory.sample(self.trainBatchSize)

        transitions = Transition(*zip(*transitions_raw))

        # for some envs, the output state requires further processing before being fed to the neural network
        if self.stateProcessor is not None:
            state = self.stateProcessor(transitions.state)
            nextState = self.stateProcessor(transitions.next_state)
        else:
            state = torch.tensor(transitions.state, dtype=torch.float32)
            nextState = torch.tensor(transitions.next_state, dtype=torch.float32)

        action = torch.tensor(transitions.action, dtype=torch.long).unsqueeze(-1) # shape(batch, 1)
        reward = torch.tensor(transitions.reward, dtype=torch.float32).unsqueeze(-1) # shape(batch, 1)

        batchSize = reward.shape[0]

        QValues = self.policyNet(state).gather(1, action)
        # note: the policyNet is also used here to compute the target value
        QNext = self.policyNet(nextState).detach()
        targetValues = reward + self.gamma * QNext.max(dim=1)[0].unsqueeze(-1)

        loss = torch.mean(self.netLossFunc(QValues, targetValues))

        self.optimizer.zero_grad()

        loss.backward()

        # for lp, gp in zip(self.localNet.parameters(), self.globalNet.parameters()):
        #     gp._grad = lp._grad
        #
        # if self.netGradClip is not None:
        #     torch.nn.utils.clip_grad_norm_(self.policyNet.parameters(), self.netGradClip)
        #
        # # global net update
        # self.globalOptimizer.step()
        #
        # # update local net
        # self.localNet.load_state_dict(self.globalNet.state_dict())

        if self.globalStepCount % self.lossRecordStep == 0:
            self.losses.append([self.globalStepCount, self.epIdx, loss.item()])
Example #13
    def update_net_and_sync(self, state, action, nextState, reward):

        self.store_experience(state, action, nextState, reward)

        if self.priorityMemoryOption:
            if len(self.memory) < self.config['memoryCapacity']:
                return
        else:
            if len(self.memory) < self.trainBatchSize:
                return

        if self.totalStep % self.updateGlobalFrequency == 0:
            transitions_raw = self.memory.sample(self.trainBatchSize)
            transitions = Transition(*zip(*transitions_raw))
            action = torch.tensor(transitions.action, device=self.device, dtype=torch.long).unsqueeze(
                -1)  # shape(batch, 1)
            reward = torch.tensor(transitions.reward, device=self.device, dtype=torch.float32).unsqueeze(
                -1)  # shape(batch, 1)
            batchSize = reward.shape[0]


            # for some envs, the output state requires further processing before being fed to the neural network
            if self.stateProcessor is not None:
                state, _ = self.stateProcessor(transitions.state, self.device)
                nonFinalNextState, nonFinalMask = self.stateProcessor(transitions.next_state, self.device)
            else:
                state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
                nonFinalMask = torch.tensor([s is not None for s in transitions.next_state],
                                            device=self.device, dtype=torch.bool)
                nonFinalNextState = torch.tensor([s for s in transitions.next_state if s is not None],
                                                 device=self.device, dtype=torch.float32)
            if self.synchLock:

                self.lock.acquire()
                QValues = self.globalPolicyNet(state).gather(1, action)

                if self.netUpdateOption == 'targetNet':
                    # Here we detach because we do not want gradient flow from target values to net parameters
                    QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32)
                    QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).max(1)[0].detach()
                    targetValues = reward + self.gamma * QNext.unsqueeze(-1)
                if self.netUpdateOption == 'policyNet':
                    raise NotImplementedError
                    targetValues = reward + self.gamma * torch.max(self.globalPolicyNet(nextState).detach(), dim=1)[0].unsqueeze(-1)
                if self.netUpdateOption == 'doubleQ':
                    # select optimal action from policy net
                    with torch.no_grad():
                        batchAction = self.globalPolicyNet(nonFinalNextState).max(dim=1)[1].unsqueeze(-1)
                        QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32).unsqueeze(-1)
                        QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).gather(1, batchAction)
                        targetValues = reward + self.gamma * QNext

                loss = self.netLossFunc(QValues, targetValues)

                self.globalOptimizer.zero_grad()

                loss.backward()

                if self.netGradClip is not None:
                    torch.nn.utils.clip_grad_norm_(self.globalPolicyNet.parameters(), self.netGradClip)

                # global net update
                self.globalOptimizer.step()
                #
                # # update local net
                self.localNet.load_state_dict(self.globalPolicyNet.state_dict())

                self.lock.release()
            else:

                # update local net
                self.localNet.load_state_dict(self.globalPolicyNet.state_dict())

                QValues = self.localNet(state).gather(1, action)

                if self.netUpdateOption == 'targetNet':
                    # Here we detach because we do not want gradient flow from target values to net parameters
                    QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32)
                    QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).max(1)[0].detach()
                    targetValues = reward + self.gamma * QNext.unsqueeze(-1)
                if self.netUpdateOption == 'policyNet':
                    raise NotImplementedError
                    targetValues = reward + self.gamma * torch.max(self.globalPolicyNet(nextState).detach(), dim=1)[
                        0].unsqueeze(-1)
                if self.netUpdateOption == 'doubleQ':
                    # select optimal action from policy net
                    with torch.no_grad():
                        batchAction = self.localNet(nonFinalNextState).max(dim=1)[1].unsqueeze(-1)
                        QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32).unsqueeze(-1)
                        QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).gather(1, batchAction)
                        targetValues = reward + self.gamma * QNext

                loss = self.netLossFunc(QValues, targetValues)

                loss.backward()

                self.lock.acquire()

                self.globalOptimizer.zero_grad()

                for lp, gp in zip(self.localNet.parameters(), self.globalPolicyNet.parameters()):
                    if self.device == 'cpu':
                        gp._grad = lp._grad
                    else:
                        gp._grad = lp._grad.cpu()

                if self.netGradClip is not None:
                    torch.nn.utils.clip_grad_norm_(self.globalPolicyNet.parameters(), self.netGradClip)

                # global net update
                self.globalOptimizer.step()

                self.lock.release()
                #
                # # update local net
                self.localNet.load_state_dict(self.globalPolicyNet.state_dict())
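
The 'doubleQ' branches above select the greedy action with the online network and evaluate it with the target network. A minimal standalone sketch with toy tensors (policy_q and target_q stand in for the two networks' outputs on the non-final next states):

import torch

policy_q = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # online net Q(s', .) for two next states
target_q = torch.tensor([[0.8, 2.5], [1.9, 0.4]])   # target net Q(s', .)
bestActions = policy_q.max(dim=1)[1].unsqueeze(-1)  # argmax actions from the online net
qNext = target_q.gather(1, bestActions)             # evaluated by the target net
# qNext == tensor([[2.5], [1.9]]); decoupling selection from evaluation reduces overestimation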
Example #14
    def update_net(self, state, action, nextState, reward, info):

        # state, nonFinalMask, nonFinalNextState, action, reward = self.prepare_minibatch(state, action, nextState, reward, info)
        self.store_experience(state, action, nextState, reward, info)
        if len(self.memory) < self.trainBatchSize:
            return
        transitions_raw = self.memory.sample(self.trainBatchSize)
        transitions = Transition(*zip(*transitions_raw))
        action = torch.tensor(transitions.action,
                              device=self.device,
                              dtype=torch.float32)  # shape(batch, numActions)
        reward = torch.tensor(transitions.reward,
                              device=self.device,
                              dtype=torch.float32)  # shape(batch)

        # for some envs, the output state requires further processing before being fed to the neural network
        if self.stateProcessor is not None:
            state, _ = self.stateProcessor(transitions.state, self.device)
            nonFinalNextState, nonFinalMask = self.stateProcessor(
                transitions.next_state, self.device)
        else:
            state = torch.tensor(transitions.state,
                                 device=self.device,
                                 dtype=torch.float32)
            nonFinalMask = torch.tensor(tuple(
                map(lambda s: s is not None, transitions.next_state)),
                                        device=self.device,
                                        dtype=torch.bool)
            nonFinalNextState = torch.tensor(
                [s for s in transitions.next_state if s is not None],
                device=self.device,
                dtype=torch.float32)

        batchSize = reward.shape[0]

        # Critic loss
        QValuesOne = self.criticNetOne.forward(state, action).squeeze()
        QValuesTwo = self.criticNetTwo.forward(state, action).squeeze()

        actionNoise = torch.randn((nonFinalNextState.shape[0], self.numAction),
                                  dtype=torch.float32,
                                  device=self.device)
        next_actions = self.actorNet_target.forward(
            nonFinalNextState) + actionNoise * self.policySmoothNoise

        # next_actions = self.actorNet_target.forward(nonFinalNextState)

        QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32)
        QNextCriticOne = self.criticNet_targetOne.forward(
            nonFinalNextState, next_actions.detach()).squeeze()
        QNextCriticTwo = self.criticNet_targetTwo.forward(
            nonFinalNextState, next_actions.detach()).squeeze()

        QNext[nonFinalMask] = torch.min(QNextCriticOne, QNextCriticTwo)

        targetValues = reward + self.gamma * QNext

        criticOne_loss = self.netLossFunc(QValuesOne, targetValues)
        criticTwo_loss = self.netLossFunc(QValuesTwo, targetValues)

        self.criticOne_optimizer.zero_grad()
        self.criticTwo_optimizer.zero_grad()

        # https://jdhao.github.io/2017/11/12/pytorch-computation-graph/
        criticOne_loss.backward(retain_graph=True)
        criticTwo_loss.backward()

        if self.netGradClip is not None:
            torch.nn.utils.clip_grad_norm_(self.criticNetOne.parameters(),
                                           self.netGradClip)
            torch.nn.utils.clip_grad_norm_(self.criticNetTwo.parameters(),
                                           self.netGradClip)

        self.criticOne_optimizer.step()
        self.criticTwo_optimizer.step()

        # delayed policy update: only every policyUpdateFreq learn steps
        if self.learnStepCounter % self.policyUpdateFreq == 0:
            # Actor loss
            # we try to maximize the criticNet output (the state-action value)
            policy_loss = -self.criticNetOne.forward(
                state, self.actorNet.forward(state)).mean()

            # update networks
            self.actor_optimizer.zero_grad()
            policy_loss.backward()
            if self.netGradClip is not None:
                torch.nn.utils.clip_grad_norm_(self.actorNet.parameters(),
                                               self.netGradClip)

            self.actor_optimizer.step()

            if self.globalStepCount % self.lossRecordStep == 0:
                self.losses.append([
                    self.globalStepCount, self.epIdx,
                    criticOne_loss.item(),
                    criticTwo_loss.item(),
                    policy_loss.item()
                ])

            # soft-update the target networks together with the delayed policy update
            for target_param, param in zip(self.actorNet_target.parameters(),
                                           self.actorNet.parameters()):
                target_param.data.copy_(param.data * self.tau +
                                        target_param.data * (1.0 - self.tau))

            for target_param, param in zip(self.criticNet_targetOne.parameters(),
                                           self.criticNetOne.parameters()):
                target_param.data.copy_(param.data * self.tau +
                                        target_param.data * (1.0 - self.tau))

            for target_param, param in zip(self.criticNet_targetTwo.parameters(),
                                           self.criticNetTwo.parameters()):
                target_param.data.copy_(param.data * self.tau +
                                        target_param.data * (1.0 - self.tau))

        self.learnStepCounter += 1
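
The three copy loops above perform Polyak (soft) target updates with rate tau. A small helper could factor out the repetition; this soft_update function is a sketch and not part of the original code:

import torch

def soft_update(targetNet, sourceNet, tau):
    # targetNet <- tau * sourceNet + (1 - tau) * targetNet, parameter by parameter
    with torch.no_grad():
        for target_param, param in zip(targetNet.parameters(), sourceNet.parameters()):
            target_param.data.copy_(param.data * tau + target_param.data * (1.0 - tau))

# usage (with the agent's attributes): soft_update(self.actorNet_target, self.actorNet, self.tau)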
    def store_experience(self, states, actions, nextStates, rewards, infos):

        for i in range(len(states)):
            transition = Transition(states[i], actions[i], nextStates[i],
                                    rewards[i])
            self.memory.push(transition)
Example #16
from Agents.Core.ReplayMemory import ReplayMemory, Transition
#from ..Agents.Core.ReplayMemory import ReplayMemory, Transition
import torch

tran1 = Transition(1, 1, 1, 1)
tran2 = Transition(2, 2, 2, 2)
memory = ReplayMemory(10)
memory.push(tran1)
memory.push(tran2)
memory.push(3, 3, 3, 3)
print(memory)

memory.write_to_text('memoryOut.txt')

toTensor = memory.totensor()
toTensor2 = torch.tensor(memory.sample(2))
for i in range(5, 50):
    tran = Transition(i, i, i, i)
    memory.push(tran)

print(memory)
memory.clear()
print(memory)
print(toTensor)
print(toTensor2)
Example #17
from Agents.Core.ReplayMemory import ReplayMemory, Transition
#from ..Agents.Core.ReplayMemory import ReplayMemory, Transition
import torch
import numpy as np
import pickle

state1 = np.random.rand(5, 5)
state2 = np.random.rand(5, 5)
state3 = np.random.rand(5, 5)
state4 = np.random.rand(5, 5)

tran1 = Transition(state1, 1, state2, 1)
tran2 = Transition(state3, 2, state4, 2)
memory = ReplayMemory(10)
memory.push(tran1)
memory.push(tran2)
print(memory)

with open('memory.pickle', 'wb') as file:
    pickle.dump(memory, file)

with open('memory.pickle', 'rb') as file:
    memory2 = pickle.load(file)

print(memory2)
    def store_experience(self, state, action, nextState, reward, info):

        if self.experienceProcessor is not None:
            state, action, nextState, reward = self.experienceProcessor(state, action, nextState, reward, info)
        transition = Transition(state, action, nextState, reward)
        self.memory.push(transition)