Example #1
0
    def __init__(self, params, name, task):
        super(A2C, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']
        if self.trainMode:
            self.value = Network(self.vPars, self.vTrain)
            self.policyNet = A2CNetwork(self.aPars, self.aTrain)
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/PolicyNet.txt"
                ))

        self.exp = Replay(self.batch_size)
        self.replaceCounter = 0
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        while (not self.stop):
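            # Busy-wait until the task sets self.stop; training is driven by the task's callbacks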
            x = 1 + 1
        task.postTraining()
Example #2
0
    def __init__(self, params, name, task):
        self.name           = name
        self.task           = task

        self.vPars          = params['valPars']
        self.vTrain         = params['valTrain']
        self.mPars          = params['mPars']
        self.mTrain         = params['mTrain']
        self.wPars          = params['actPars']
        self.wTrain         = params['actTrain']
        self.w_vPars        = params['w_vPars']
        self.w_vTrain       = params['w_vTrain']

        self.agents         = params['agents']
        self.pubs = {}
        for key in self.agents.keys():
            bot             = self.agents[key]
            self.pubs[key]  = rospy.Publisher(bot['pub'], Vector3, queue_size = 1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size = 1)

        self.valueLoss      = []

        self.manager        = Network(self.mPars, self.mTrain)
        self.m_critic       = Network(self.vPars, self.vTrain) 
        self.m_critic_target= Network(self.vPars, self.vTrain)
        self.worker         = Network(self.wPars, self.wTrain)
        self.w_critic       = Network(self.w_vPars, self.w_vTrain)
        self.w_critic_target= Network(self.w_vPars, self.w_vTrain)

        self.m_discount     = self.vTrain['m_gamma']
        self.w_discount     = self.vTrain['w_gamma']
        self.lr             = self.vTrain['lr']
        self.trainMode      = self.vPars['trainMode']
        self.step           = self.vTrain['step']
        self.stop           = False
        self.c              = self.mTrain['c']
        self.tau            = .005
        self.noise          = Noise(self.manager.neurons[-1], theta = .4, max_sigma = .2, min_sigma = 0, decay = 1)

        self.exp            = Memory()
        self.temp           = []
        self.totalSteps     = 0
        self.soft           = nn.Softmax(dim=1)

        self.reset()

        task.initAgent(self)

        while(not self.stop):
            x = 1+1

        task.postTraining()
Example #3
0
    def __init__(self, params, name, task):
        super(TRPOAgent, self).__init__(params, name, task)
        self.valueNet = Network(self.vPars, self.vTrain)
        self.policyNet = Network(params['actPars'], params['actTrain'])
        self.running_state = ZFilter((self.vPars['in_n'], ), clip=5)
        self.running_reward = ZFilter((1, ), demean=False, clip=10)
        self.experience = Memory()
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        task.initAgent(self)
        while (not self.stop):
            x = 1 + 1
        task.postTraining()
Example #4
0
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(), self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = ['/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/model_training_data/hierarchical_q_policy2.txt']
            if not load_path:
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size = self.expSize)

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.beta = .5
        self.alpha = .7

        self.double = self.vTrain['double']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0
        

        task.initAgent(self)
    
        if not load_path:
            while(not self.stop):
                x = 1+1
            task.postTraining()
Example #5
0
    def __init__(self, params, name, task):
        super(SAC, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']
        self.qPars = params['qPars']
        self.qTrain = params['qTrain']
        if self.trainMode:
            self.QNet = Network(self.qPars, self.qTrain).to(device)
            self.VNet = Network(self.vPars, self.vTrain).to(device)
            self.VTar = Network(self.vPars, self.vTrain).to(device)
            self.policyNet = SoftNetwork(self.aPars, self.aTrain).to(device)
            # Initialize the value target network to match the value network
            for target_param, param in zip(self.VTar.parameters(),
                                           self.VNet.parameters()):
                target_param.data.copy_(param.data)
        else:
            print('Not implemented')

        self.expSize = self.vTrain['buffer']
        self.actions = self.aPars['neurons'][-1]
        self.state = self.aPars['neurons'][0]
        self.exp = ReplayBuffer(self.expSize, self.actions, np.float32,
                                self.state, np.float32)

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1
        task.postTraining()
Example #6
0
    def __init__(self, params, name, task):
        super(Twin_DDPG, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']

        if self.trainMode:
            self.values = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            self.policyNet = TD3Network(self.aPars, self.aTrain)
            self.tarPolicy = TD3Network(self.aPars, self.aTrain)

            if self.load:
                self.load_nets()

            self.tarPolicy.load_state_dict(self.policyNet.state_dict())
            self.tar = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            for i in range(len(self.values)):
                self.tar[i].load_state_dict(self.values[i].state_dict())
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"
                ))

        self.base = self.vTrain['baseExplore']
        self.step = self.vTrain['decay']
        self.expSize = self.vTrain['buffer']
        self.exp = Replay(self.expSize)
        self.a = self.vTrain['a']
        self.tau = self.vPars['tau']
        self.smooth = self.vTrain['smooth']
        self.clip = self.vTrain['clip']
        self.delay = self.vTrain['policy_delay']
        self.mean_range = self.aPars['mean_range']
        self.noise = OUNoise(self.out_n,
                             mu=0,
                             theta=.15,
                             max_sigma=self.explore,
                             min_sigma=self.base,
                             decay=self.step)
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1
        task.postTraining()
Example #7
0
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7

        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()
Example #8
0
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7

        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/jimmy/Documents/Research/AN_Bridging/results/hierarchical_q_policy2.txt'
        )
        pass

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)
        if len(self.priorities) < self.expSize:
            self.priorities.append(1)
        else:
            self.priorities = self.priorities[1:]
            self.priorities.append(1)

    def get_q(self, s):
        if type(self.valueNet) == list:
            model_index = np.random.randint(len(self.valueNet))
            net = self.valueNet[model_index]
        else:
            net = self.valueNet
        q = net(torch.FloatTensor(s))
        q = q.detach()
        return q

    def get_action(self, s, testing_time=False, probabilistic=False):
        i = np.random.random()
        if i < self.explore and self.trainMode and not testing_time:
            index = np.random.randint(self.out_n)
        else:
            q = self.get_q(s)
            if probabilistic:
                q = q.numpy()
                q = q - np.max(q)
                probs = np.exp(q * self.beta)
                probs = probs / np.sum(probs)
                index = np.random.choice(q.size, p=probs.ravel())
                # print('probability chosen ', probs.ravel()[index])
            else:
                index = np.argmax(q.numpy())
        self.explore = max(.2, self.explore * .9997)
        return index

    def get_q_and_q_tar(self, states, actions, nextStates, rewards, masks):
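        # Compute Q(s, a) from the online net and the DQN target: the target net
        # evaluates next-state actions, selected by the online net when self.double is set.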
        qValues = self.valueNet(
            torch.FloatTensor(states).squeeze(1))  #pass in. Processing implied
        q = torch.gather(
            qValues, 1,
            torch.LongTensor(actions).unsqueeze(1))  #get q values of actions
        qnext = self.tarNet(torch.FloatTensor(nextStates))
        qnext = qnext.squeeze(1).detach()  #pass in

        if self.double:
            qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
            qNextDouble = qNextDouble.squeeze(1).detach()  #pass in
            qnext = torch.gather(
                qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
            qtar = torch.FloatTensor(rewards).squeeze(
                1) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
        else:
            qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                masks).unsqueeze(1) * qnext.max(1)[0].view(
                    self.batch_size, 1)  #calculate target
        return q, qtar

    def train(self, override=False):
        if len(self.exp) >= 500 or override:
            if self.priority:
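                # Prioritized replay: sample transitions with probability proportional to
                # priority^alpha and correct the bias with importance weights (N * p)^-beta,
                # normalized by the largest weight in the batch.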
                loss = 0
                weights = []
                errors = []
                assert len(self.priorities) == len(self.exp)
                for i in range(self.batch_size):
                    probs = np.array(
                        [math.pow(p, self.alpha) for p in self.priorities])
                    probs = probs / np.sum(probs)
                    choice = np.random.choice(len(self.priorities),
                                              p=probs,
                                              size=1)
                    weights.append(
                        math.pow(
                            len(self.priorities) *
                            self.priorities[int(np.asscalar(choice))],
                            -self.beta))
                    states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.get_transitions(
                        choice)
                    q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                                   rewards, masks)
                    td = qtar - q
                    self.priorities[int(np.asscalar(choice))] = abs(td[:, 0])
                    errors.append(self.valueNet.get_loss(q, qtar))
                max_weight = max(weights)
                weights = [w / max_weight for w in weights]
                val_loss = sum([w * e for w, e in zip(weights, errors)])

            else:
                states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                    batch=self.batch_size)

                if self.replaceCounter % self.update_target_network == 0:
                    self.tarNet.load_state_dict(self.valueNet.state_dict())
                    self.replaceCounter = 0

                if self.noise:
                    states = np.array(states)
                    states = states + np.random.normal(0, self.noise,
                                                       states.shape)

                q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                               rewards, masks)
                val_loss = self.valueNet.get_loss(q, qtar)

            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
            return val_loss
Example #9
0
class A2C(Agent):
    def __init__(self, params, name, task):
        super(A2C, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']
        if self.trainMode:
            self.value = Network(self.vPars, self.vTrain)
            self.policyNet = A2CNetwork(self.aPars, self.aTrain)
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/PolicyNet.txt"
                ))

        self.exp = Replay(self.batch_size)
        self.replaceCounter = 0
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1
        task.postTraining()

    def saveModel(self):
        #print("Network saved")
        pass

    def get_action(self, s):
        output = self.policyNet(torch.FloatTensor(s))
        action_mean = output[:, :int(self.out_n / 2)]
        action_logstd = output[:, int(self.out_n / 2):]
        action_std = torch.exp(action_logstd)
        action = (torch.normal(action_mean,
                               action_std).detach().numpy()).ravel()
        return action

    def train(self):
        if self.dataSize == self.batch_size:
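            # One A2C update per collected batch: fit the critic to the one-step TD target
            # r + gamma * V(s'), then update the Gaussian policy with advantage-weighted
            # log-probabilities plus an entropy term.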
            self.totalSteps += 1
            s, a, r, n_s, n_a, mask = self.exp.get_data()
            mask = torch.FloatTensor(np.where(
                mask > .5, 0, 1))  #if fail, equal to 1 so set mask to 0

            #Critic update
            vTar = torch.FloatTensor(
                r) + self.discount * self.value(n_s).detach()
            v = self.value(s)
            loss = self.value.loss_fnc(v, vTar)
            self.value.optimizer.zero_grad()
            loss.backward()
            self.value.optimizer.step()
            self.avgLoss += loss

            #Policy update:
            advantage = (vTar - v).detach()
            out = self.policyNet(s)
            mean = out[:, :int(self.out_n)]
            log_std = out[:, int(self.out_n):]
            log_prob = Normal(mean, torch.exp(log_std)).log_prob(
                torch.FloatTensor(a))
            entropy = -torch.sum(torch.exp(log_prob) * log_prob)
            grad = -torch.sum(log_prob * advantage) + .01 * entropy
            self.policyNet.optimizer.zero_grad()
            grad.backward()
            self.policyNet.optimizer.step()
            self.avgActLoss += grad

            #iteration updates
            self.trainIt += 1
            self.dataSize = 0
Example #10
0
class HIRO(object):
    def __init__(self, params, name, task):
        self.name           = name
        self.task           = task

        self.vPars          = params['valPars']
        self.vTrain         = params['valTrain']
        self.mPars          = params['mPars']
        self.mTrain         = params['mTrain']
        self.wPars          = params['actPars']
        self.wTrain         = params['actTrain']
        self.w_vPars        = params['w_vPars']
        self.w_vTrain       = params['w_vTrain']

        self.agents         = params['agents']
        self.pubs = {}
        for key in self.agents.keys():
            bot             = self.agents[key]
            self.pubs[key]  = rospy.Publisher(bot['pub'], Vector3, queue_size = 1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size = 1)

        self.valueLoss      = []

        self.manager        = Network(self.mPars, self.mTrain)
        self.m_critic       = Network(self.vPars, self.vTrain) 
        self.m_critic_target= Network(self.vPars, self.vTrain)
        self.worker         = Network(self.wPars, self.wTrain)
        self.w_critic       = Network(self.w_vPars, self.w_vTrain)
        self.w_critic_target= Network(self.w_vPars, self.w_vTrain)

        self.m_discount     = self.vTrain['m_gamma']
        self.w_discount     = self.vTrain['w_gamma']
        self.lr             = self.vTrain['lr']
        self.trainMode      = self.vPars['trainMode']
        self.step           = self.vTrain['step']
        self.stop           = False
        self.c              = self.mTrain['c']
        self.tau            = .005
        self.noise          = Noise(self.manager.neurons[-1], theta = .4, max_sigma = .2, min_sigma = 0, decay = 1)

        self.exp            = Memory()
        self.temp           = []
        self.totalSteps     = 0
        self.soft           = nn.Softmax(dim=1)

        self.reset()

        task.initAgent(self)

        while(not self.stop):
            x = 1+1

        task.postTraining()

    def receiveDone(self, message):
        if message.data  == 1: #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2: #timed out. Check manager.py
            self.task.restartProtocol(restart = 1)

    def get_action(self, s, s_w = None):
        s = torch.FloatTensor(s)
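        # Every c steps the manager emits a new goal (with exploration noise); otherwise the
        # goal is carried forward with the transition g' = s_prev + g - s on the goal dimensions.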
        if self.iteration % self.c == 0:
            self.goal = self.manager(s)
            noise = torch.FloatTensor(self.noise.get_noise())
            self.goal += noise
        else:
            self.goal = self.prevState[:,:2] + self.goal - s[:,:2]

        self.temp_second = self.temp_first
        self.temp_first = self.goal
        self.prevState = s
        s = s[:,:6]
        inpt = torch.cat((s, self.goal), dim=1)
        policy = self.worker(inpt)
        policy = self.soft(policy)
        choice = np.asscalar(self.choose(policy))
        self.iteration += 1
        return choice #single env

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return action
    
    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done):
        if self.temp_second is not None:
            self.temp.append(Transition(s, a, r, 1-done, sprime, None, self.temp_second.detach().numpy(), self.goal.detach().numpy()))
            if self.iteration % self.c == 1 and self.iteration != 1: # remember, we push at 1 because we incremented in get_action
                self.temp = Transition(*zip(*self.temp))
                self.exp.push(self.temp) # store into exp
                self.temp = []


    def reset(self):
        self.iteration = 0
        self.temp_first, self.temp_second = (None, None)
        self.prevState = None
        self.temp = []
        return 
    
    def generateSamples(self, goals, states, next_states):
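        # Off-policy goal relabelling: candidate goals are the achieved transition
        # (next_state - state), the original goal, and 8 goals sampled around the achieved transition.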
        next_states = next_states[:, :2]
        states = states[:, :2]
        candidates = (next_states - states).unsqueeze(0)
        candidates = torch.cat((candidates, goals.unsqueeze(0)), dim=0)
        normal = Normal(next_states - states, torch.ones(next_states.size()) / 2)
        sampled = normal.sample((8,))
        candidates = torch.cat((candidates, sampled), dim=0)
        # return shape (# candidates, batch_size, dimensions of goal)
        return candidates
    
    def getTransitions(self, initial_goals, states, next_states):
        # initial_goals shape: (# candidates ,batch_size, dimensions of goal)
        # states shape: (batch_size, c, dimensions of state)
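        # Roll each candidate goal forward through the c worker steps using the same
        # goal-transition rule as get_action, so candidates can be scored consistently.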
        states = states[:,:, :2]
        next_states = next_states[:,:,:2]
        goals = [initial_goals.unsqueeze(0)]
        for c in range(self.c - 1):
            prev = goals[-1].squeeze(0)
            curr = states[:, c, :] + prev - next_states[:,c,:] # broadcast. This should take shape of initial_goals 
            goals.append(curr.unsqueeze(0))
        goals = torch.cat(goals, dim=0)
        goals = goals.transpose(0,1)
        goals = goals.transpose(1,2)
        # return shape (# candidates, batch_size, c, dimensions of goal)
        return goals
    
    def getProbabilities(self, transitions, states, actions):
        # transitions shape (# candidates, batch_size, c, dimensions of goal)
        # states shape: (batch_size, c, dimensions of state)
        # actions shape: (batch_size, c)
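        # Score each candidate goal sequence by the product of the worker action
        # probabilities it induces over the c steps.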
        states = states[:, :, :6]
        states = states.unsqueeze(0)
        size = states.size()
        states = states.expand(transitions.size()[0], size[1], size[2], size[3])
        inpt = torch.cat((states, transitions), dim=3)
        soft = nn.Softmax(dim = 3)
        actions = actions.expand(transitions.size()[0], actions.size()[0], actions.size()[1]).unsqueeze(3)
        probs = soft(self.worker(inpt)).gather(3, actions.long()).squeeze(3)
        probs = torch.prod(probs, dim=2)
        # return shape (# candidates, batch_size) of probabilities
        return probs

    def train(self):
        if len(self.exp) > 300:

            groups = self.exp.sample(self.step) # sample groupings of samples
            m_states = torch.cat(list(map(lambda g: torch.Tensor(g.state[0]), groups)), dim=0)
            m_next_states = torch.cat(list(map(lambda g: torch.Tensor(g.next_state[-1]), groups)), dim=0)
            m_goals = torch.cat(list(map(lambda g: torch.Tensor(g.goal[0]), groups)), dim=0)
            m_rewards = torch.Tensor(list(map(lambda g: sum(g.reward), groups))).squeeze(2)
            m_masks = torch.Tensor(list(map(lambda g: g.mask[-1], groups))).unsqueeze(1)

            w_states = torch.cat(list(map(lambda g: torch.Tensor(g.state).unsqueeze(0), groups)), dim=0).squeeze()
            w_next_states = torch.cat(list(map(lambda g: torch.Tensor(g.next_state).unsqueeze(0), groups)), dim=0).squeeze()
            w_actions = torch.cat(list(map(lambda g: torch.Tensor(g.action).unsqueeze(0), groups)), dim=0)

            candidates = self.generateSamples(m_goals, m_states, m_next_states)
            cand_transitions = self.getTransitions(candidates, w_states, w_next_states)
            probs = self.getProbabilities(cand_transitions, w_states, w_actions)
            cand_indices = probs.argmax(dim=0).unsqueeze(0).unsqueeze(2)
            cand_indices = cand_indices.expand(cand_indices.size()[0], cand_indices.size()[1], candidates.size()[2])
            m_goals = candidates.gather(0, cand_indices).squeeze() #size: (batch_size, dimension of goals)
 
            states = []
            actions = []
            next_states = []
            masks = []
            goals = []
            next_goals = []
            for g in groups:
                states.append(torch.Tensor(g.state).squeeze()[:, :6])
                actions.append(torch.Tensor(g.action).squeeze())
                next_states.append(torch.Tensor(g.next_state).squeeze()[:, :6])
                masks.append(torch.Tensor(g.mask).squeeze())
                goals.append(torch.Tensor(g.goal).squeeze())  
                next_goals.append(torch.Tensor(g.next_goal).squeeze())

            states = torch.cat(states, dim=0)
            actions = torch.cat(actions, dim=0).unsqueeze(1)
            next_states = torch.cat(next_states, dim=0)
            masks = torch.cat(masks, dim=0).unsqueeze(1)
            goals = torch.cat(goals, dim=0) 
            next_goals = torch.cat(next_goals, dim=0)
            rewards = -torch.norm(states[:,:2] + goals - next_states[:,:2], dim=1).unsqueeze(1)

            # Manager critic
            q = self.m_critic(torch.cat((m_states, m_goals), dim=1))
            m_next_actions = self.manager(m_next_states)
            q_tar = m_rewards + self.m_discount * self.m_critic_target(torch.cat((m_next_states, m_next_actions), dim=1))
            loss = self.m_critic.get_loss(q, q_tar.detach())
            self.m_critic.optimizer.zero_grad()
            loss.backward()
            self.m_critic.optimizer.step()

            # Manager actor
            new_actions = self.manager(m_states)
            q = self.m_critic(torch.cat((m_states, new_actions), dim=1))
            loss = -q.mean()
            self.manager.optimizer.zero_grad()
            loss.backward()
            self.manager.optimizer.step()

            # Worker critic
            q = self.w_critic(torch.cat((states, goals), dim=1)).gather(1, actions.long())
            next_actions = self.worker(torch.cat((next_states, next_goals), dim=1))
            next_actions = self.choose(self.soft(next_actions))
            q_tar = rewards + self.w_discount * masks * self.w_critic_target(torch.cat((next_states, next_goals), dim=1)).gather(1, torch.Tensor(next_actions).long().unsqueeze(1))
            loss = self.w_critic.get_loss(q, q_tar.detach())
            self.w_critic.optimizer.zero_grad()
            loss.backward()
            self.w_critic.optimizer.step()

            # Worker actor 
            new_actions = self.worker(torch.cat((states[:,:6], goals), dim=1))
            policy = self.soft(new_actions)
            new_actions = self.choose(policy)
            q = self.w_critic(torch.cat((states, goals), dim=1))
            q = q.gather(1, torch.Tensor(new_actions).long().unsqueeze(1))
            loss = -q.mean()
            self.worker.optimizer.zero_grad()
            loss.backward()
            self.worker.optimizer.step()
    
            for target_param, param in zip(self.m_critic_target.parameters(), self.m_critic.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            for target_param, param in zip(self.w_critic_target.parameters(), self.w_critic.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            
            # Push updated replay entries into replay

            for i, goal in enumerate(m_goals):
                curr_group = groups[i]
                curr_goal = goal.unsqueeze(0).detach().numpy()
                inserts = (curr_goal,)
                for j in range(self.c - 1):
                    curr_goal = curr_group.state[j][:,:2].reshape(1,-1) + curr_goal - curr_group.next_state[j][:,:2].reshape(1,-1)
                    inserts = inserts + (curr_goal,)
                curr_group = curr_group._replace(goal=inserts)
                self.exp.push(curr_group)

            self.totalSteps += 1

            return loss
Example #11
0
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = OrderedDict()
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = CounterActor(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                CounterActor(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch']
        self.discount = self.vTrain['gamma']
        self.temp_second = None
        self.temp_first = None
        self.td_lambda = 0  # TEST: this is because we are doing ER off-policy
        self.tau = .01
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()
Example #12
0
class Counter(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = OrderedDict()
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = CounterActor(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                CounterActor(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch']
        self.discount = self.vTrain['gamma']
        self.temp_second = None
        self.temp_first = None
        self.td_lambda = 0  # TEST: this is because we are doing ER off-policy
        self.tau = .01
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2:  #timed out. Check manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            policy1 = self.actor(torch.FloatTensor(s_split[0]))
            a1 = np.asscalar(self.choose(policy1))
            policy2 = self.actor(torch.FloatTensor(s_split[1]))
            a2 = np.asscalar(self.choose(policy2))
        else:
            policy1 = self.actor[0](torch.FloatTensor(s_split[0]))
            a1 = self.choose(policy1)
            policy2 = self.actor[1](torch.FloatTensor(s_split[1]))
            a2 = self.choose(policy2)
        # THIS IS A TEST
        a1 = 0
        #print(policy1)
        #print(policy2)
        #print('')
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return action

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local,
                      None)

    def reset(self):
        self.train(True)
        if self.trained:
            self.actor.eps = max(.05, self.actor.eps - .003)
        self.trained = False
        self.temp_first, self.temp_second = (None, None)
        self.h = [
            torch.zeros((1, 1, self.h_state_n))
            for i in range(len(self.agents))
        ]
        self.prevAction = [-1, -1]
        return

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
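        # TD(lambda) targets computed backwards over the sampled trajectory, mixing
        # bootstrapped target-network values with recursively accumulated returns.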
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = rewards[-1] + target_qs[-1] * mask[-1]

        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = mask[t] * (self.td_lambda * gamma * ret[t + 1]) + (
                rewards[t] +
                (1 - self.td_lambda) * gamma * target_qs[t] * mask[t])
        return ret.unsqueeze(1)

    def train(self, episode_done=False):
        if len(self.exp) > self.batch_size:
            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            states_next = torch.squeeze(torch.Tensor(
                transition.next_state)).to(device)
            actions = torch.Tensor(transition.action).float().to(device)
            rewards = torch.Tensor(transition.reward).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

            actions_next = []
            for s in next_local:
                next_policy = self.actor(s)
                next_action = self.choose(next_policy)
                actions_next.append(torch.Tensor(next_action))
            '''# Critic Update
            ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states_next, actions_next[1].unsqueeze(1), ID), dim = 1)
            q_tar = self.target(inp).detach().gather(1, actions_next[0].long().unsqueeze(1))
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(), self.discount, q_tar)
            inp = torch.cat((states, actions[:, 1].unsqueeze(1), ID), dim = 1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 0].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()'''

            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states_next, actions_next[0].unsqueeze(1), ID),
                            dim=1)
            q_tar = self.target(inp).detach().gather(
                1, actions_next[1].long().unsqueeze(1))  # .max(1)?
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(),
                                            self.discount, q_tar)
            inp = torch.cat((states, actions[:, 0].unsqueeze(1), ID), dim=1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 1].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()

            actor_loss = 0
            # Actor Update. Consider doing new_actions
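            # COMA-style counterfactual advantage: Q of the taken action minus the
            # policy-weighted average of Q over this agent's actions, with the other
            # agent's action held fixed.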
            policies = []
            new_actions = []
            for s in local:
                policy = self.actor(s)
                policies.append(policy)
                new_action = self.choose(policy)
                new_actions.append(torch.Tensor(new_action))
            '''ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states, new_actions[1].unsqueeze(1), ID), dim = 1)
            q_out = self.critic(inp) #batch x num_actions
            policy = policies[0] #batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[0].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[0].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss '''

            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states, new_actions[0].unsqueeze(1), ID), dim=1)
            q_out = self.critic(inp)  #batch x num_actions
            policy = policies[1]  #batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[1].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[1].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss

            self.actorLoss.append(actor_loss)

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    actor.optimizer.step()

            self.totalSteps += 1
            # self.exp = Memory()
            self.trained = True

            #UPDATE TARGET NETWORK:
            if self.totalSteps % 1 == 0:  # THIS IS A TEST
                for target_param, param in zip(self.target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_((1 - self.tau) * target_param +
                                            self.tau * param.data)

            return
Example #13
0
            x = tf.placeholder(tf.float32,
                               shape=[
                                   None, ds_args.img_width, ds_args.img_height,
                                   ds_args.num_channels
                               ],
                               name="input_node")
            y = tf.placeholder(tf.float32, shape=[None, ds_args.num_classes])
            is_training = tf.placeholder(tf.bool, shape=[], name="is_training")

            # hints = tf.ragged.placeholder(tf.float32, ragged_rank=[[args.batch_size, 8, 8, 100],
            #                                                       [args.batch_size, 8, 8, 100]])

            switch_idx = tf.placeholder(tf.int32, shape=[])
            switch = tf.placeholder(tf.float32, shape=[])

        architecture = utils.get_architecture(args, ds_args)
        model = MyNetwork(architecture=architecture)
        model.build()

        logits_op = model(x,
                          is_training=is_training,
                          switch_idx=switch_idx,
                          switch=switch)  # hints=hints

        with tf.variable_scope("loss"):
            loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(y, logits_op))

        with tf.variable_scope("regularisation"):
            # Add the regularisation terms
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            loss_op += sum(reg_losses)
Example #14
0
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = [
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole.txt',
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole2.txt'
            ]
            if not load_path:
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.double = self.vTrain['double']

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/box_push_hierarchical_q_policy.txt'
        )
        pass

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)

    def get_action(self, s):
        i = np.random.random()
        if i < self.explore and self.trainMode:
            index = np.random.randint(self.out_n)
        else:
            if type(self.valueNet) == list:
                model_index = np.random.randint(len(self.valueNet))
                net = self.valueNet[model_index]
            else:
                net = self.valueNet
            q = net(torch.FloatTensor(s))
            #print(q)
            q = q.detach()
            index = np.argmax(q.numpy())
        self.explore = max(.1, self.explore * .9997)
        return index

    def train(self):
        if len(self.exp) >= 500:
            states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                batch=self.batch_size)

            if self.replaceCounter % 500 == 0:  # THIS IS SET TO 200 FOR BOX PUSH TASK...SLOPE IS 500
                self.tarNet.load_state_dict(self.valueNet.state_dict())
                self.replaceCounter = 0

            qValues = self.valueNet(torch.FloatTensor(states).squeeze(
                1))  #pass in. Processing implied
            q = torch.gather(qValues, 1,
                             torch.LongTensor(actions).unsqueeze(
                                 1))  #get q values of actions
            qnext = self.tarNet(torch.FloatTensor(nextStates))
            qnext = qnext.squeeze(1).detach()  #pass in

            if self.double:
                qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
                qNextDouble = qNextDouble.squeeze(1).detach()  #pass in
                qnext = torch.gather(
                    qnext, 1,
                    torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
                qtar = torch.FloatTensor(rewards).squeeze(
                    1
                ) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
            else:
                qtar = torch.FloatTensor(
                    rewards) + self.discount * torch.Tensor(
                        masks).unsqueeze(1) * qnext.max(1)[0].view(
                            self.batch_size, 1)  #calculate target

            val_loss = self.valueNet.get_loss(q, qtar)
            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
Example #15
0
class Twin_DDPG(Agent):
    def __init__(self, params, name, task):
        super(Twin_DDPG, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']

        if self.trainMode:
            self.values = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            self.policyNet = TD3Network(self.aPars, self.aTrain)
            self.tarPolicy = TD3Network(self.aPars, self.aTrain)

            if self.load:
                self.load_nets()

            self.tarPolicy.load_state_dict(self.policyNet.state_dict())
            self.tar = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            for i in range(len(self.values)):
                self.tar[i].load_state_dict(self.values[i].state_dict())
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"
                ))

        self.base = self.vTrain['baseExplore']
        self.step = self.vTrain['decay']
        self.expSize = self.vTrain['buffer']
        self.exp = Replay(self.expSize)
        self.a = self.vTrain['a']
        self.tau = self.vPars['tau']
        self.smooth = self.vTrain['smooth']
        self.clip = self.vTrain['clip']
        self.delay = self.vTrain['policy_delay']
        self.mean_range = self.aPars['mean_range']
        self.noise = OUNoise(self.out_n,
                             mu=0,
                             theta=.15,
                             max_sigma=self.explore,
                             min_sigma=self.base,
                             decay=self.step)
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1
        task.postTraining()

    def load_nets(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        self.policyNet.load_state_dict(torch.load(path + "policy.txt"))
        self.values[0].load_state_dict(torch.load(path + "Qvalue1.txt"))
        self.values[1].load_state_dict(torch.load(path + "Qvalue2.txt"))

    def saveModel(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        torch.save(self.policyNet.state_dict(), path + "policy.txt")
        torch.save(self.values[0].state_dict(), path + "Qvalue1.txt")
        torch.save(self.values[1].state_dict(), path + "Qvalue2.txt")
        print("Network saved")
        pass

    def get_action(self, s):
        output = self.policyNet(torch.FloatTensor(s))
        i = np.random.random()
        if i < self.explore[0]:
            #add in exploration TODO: put in OU noise
            noise = torch.from_numpy(np.random.normal(0, self.explore[1], 2))
            output = output + noise
        output = output.float()
        return output[0]

    def train(self):
        if self.dataSize > 500 and self.trainMode:
            #iteration updates
            self.trainIt += 1
            self.totalSteps += 1

            #Unpack
            s, a, r, n_s, n_a, done = self.exp.get_data()

            c = np.random.choice(min(self.dataSize, self.expSize),
                                 self.batch_size)

            s = torch.FloatTensor(s[c])
            a = torch.FloatTensor(a[c])
            r = torch.FloatTensor(r[c])
            n_s = torch.FloatTensor(n_s[c])
            done = torch.FloatTensor(done[c])
            n_a = self.tarPolicy(n_s).detach()

            #target policy smoothing: add clipped Gaussian noise to the target action
            noise = torch.FloatTensor(
                np.random.normal(0, self.smooth, n_a.shape))
            n_a = n_a + torch.clamp(noise, -self.clip, self.clip)
            n_sa = torch.cat((n_s, n_a), dim=1)
            qtar = torch.FloatTensor(r) + self.discount * (
                1 - done) * torch.min(self.tar[0](n_sa).detach(), self.tar[1]
                                      (n_sa).detach())  #pass in

            #Value update
            sa = torch.cat((s, a), dim=1)
            for qnet in self.values:
                q = qnet(sa)
                loss = qnet.loss_fnc(q, qtar)
                qnet.optimizer.zero_grad()
                loss.backward()
                qnet.optimizer.step()
                qnet.scheduler.step()
                self.avgLoss += loss / len(self.values)

            #policy update
            if self.trainIt % self.delay == 0:
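                # Delayed policy update (TD3): refresh the actor and soft-update all
                # target networks only every self.delay critic updates.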
                act = self.policyNet(s)
                s_a = torch.cat((s, act), 1)
                q = self.values[0](s_a)
                policy_loss = -q.mean()

                self.policyNet.optimizer.zero_grad()
                policy_loss.backward()
                self.policyNet.optimizer.step()
                self.policyNet.scheduler.step()
                self.avgActLoss += policy_loss

                for target_param, param in zip(self.tarPolicy.parameters(),
                                               self.policyNet.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1.0 - self.tau) *
                                            target_param.data)

                for i in range(len(self.values)):
                    for target_param, param in zip(
                            self.tar[i].parameters(),
                            self.values[i].parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1.0 - self.tau) *
                                                target_param.data)
class CounterContinuous(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = {}
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                SoftNetwork(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch_size']
        self.discount = self.vTrain['gamma']
        self.range = self.aPars['mean_range']
        self.td_lambda = .8
        self.tau = .005
        self.lower_bound = self.aTrain['clamp'][2]
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2:  #timed out. Check manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            a1, log_prob1, z, mu1, log_std1 = self.actor(
                torch.FloatTensor(s_split[0]))
            a2, log_prob2, z, mu2, log_std2 = self.actor(
                torch.FloatTensor(s_split[1]))
        else:  # TODO: Fix this below:
            a1, h_new1, log_prob1, mu1, std1 = self.actor[0](torch.FloatTensor(
                s_split[0]), self.h[0])
            a2, h_new2, log_prob2, mu2, std2 = self.actor[1](torch.FloatTensor(
                s_split[1]), self.h[1])
        # Return the sampled actions as tensors; train() later stacks the stored
        # actions with zipStack, which expects tensors.
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        return action.item()

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local)

    def reset(self):
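        # Geometrically anneal the first element of the actor's clamp toward the
        # configured lower bound after each trained episode (the clamp appears to
        # bound the policy's log-std in SoftNetwork).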
        curr = self.actor.clamp[0]
        if self.trained:
            new = max(self.lower_bound, .05 * self.lower_bound + .95 * curr)
            self.actor.clamp = (new, self.actor.clamp[1], self.lower_bound)
        self.trained = False
        return

    def get_grad_norm(self, model):
        total_norm = 0
        for p in model.parameters():
            if p.grad is None:
                continue
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        grad_norm = total_norm**(1. / 2)
        return grad_norm

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
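        # TD(lambda) targets, computed backwards over the time-ordered batch:
        #   y_T = mask_T * Q'_T
        #   y_t = lambda*gamma*y_{t+1} + mask_t*(r_t + (1 - lambda)*gamma*Q'_{t+1})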
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = target_qs[-1] * mask[-1]

        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = self.td_lambda * gamma * ret[t + 1] + \
                mask[t] * (rewards[t] + (1 - self.td_lambda) * gamma * target_qs[t + 1])
        return ret.unsqueeze(1)

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def monte_carlo(self, mean, std, n=500):
        # Draw n samples from Normal(mean, std); result has shape (n, *mean.shape)
        normal = Normal(mean, std)
        return normal.sample((n, ))

    def train(self, episode_done=False):
        if len(self.exp) >= 500:

            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            actions = self.zipStack(transition.action)
            rewards = torch.Tensor(transition.reward).to(device)
            states_next = torch.squeeze(torch.Tensor(
                transition.next_state)).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

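            # Bootstrapped critic target: sample both agents' next actions from the
            # current (shared) actor and evaluate the target critic on (s', a1', a2')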
            actions_next = []
            for s in next_local:
                a, log_prob, _, _, _ = self.actor(s)
                actions_next.append(a.detach())
            inp = torch.cat((states_next, actions_next[0], actions_next[1]),
                            dim=1)
            q_tar = rewards.unsqueeze(
                1) + self.discount * masks.unsqueeze(1) * self.target(inp)
            inp = torch.cat((states, actions[0].detach(), actions[1].detach()),
                            dim=1)
            q = self.critic(inp)
            loss = self.critic.get_loss(q, q_tar.detach())
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()
            self.valueLoss.append(loss.item())  # log the scalar loss only

            actor_loss = 0
            actions = []
            means = []
            log_stds = []
            log_probs = []
            for s in local:
                a, log_prob, z, mu, log_std = self.actor(s)
                actions.append(a)
                means.append(mu)
                log_stds.append(log_std)
                log_probs.append(log_prob)

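            # Counterfactual (COMA-style) policy gradient: each agent's advantage is
            # Q(s, a_i, a_-i) minus the baseline E_{a_i' ~ pi_i}[Q(s, a_i', a_-i)],
            # estimated below by Monte Carlo samples from that agent's policy while
            # the other agent's action is held fixed.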
            # train first agent
            inp = torch.cat((states, actions[0], actions[1].detach()), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[0], log_stds[0].exp())
            samples = self.range * torch.tanh(samples)
            repeat_s = states.unsqueeze(0)
            repeat_s = repeat_s.expand(samples.size()[0],
                                       repeat_s.size()[1],
                                       repeat_s.size()[2])
            repeat_a = actions[1].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0],
                                       repeat_a.size()[1],
                                       repeat_a.size()[2])
            inp = torch.cat((repeat_s, samples, repeat_a), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[0].view(coma.size()) * (coma)).mean()

            # train second agent
            inp = torch.cat((states, actions[0].detach(), actions[1]), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[1], log_stds[1].exp())
            samples = self.range * torch.tanh(samples)
            repeat_a = actions[0].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0],
                                       repeat_a.size()[1],
                                       repeat_a.size()[2])
            inp = torch.cat((repeat_s, repeat_a, samples), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[1].view(coma.size()) * (coma)).mean()

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    torch.nn.utils.clip_grad_norm_(actor.parameters(),
                                                   self.clip_grad_norm)
                    actor.optimizer.step()
            self.totalSteps += 1
            self.trained = True

            # Periodic hard update of the target critic
            if self.totalSteps % 50 == 0:
                for target_param, param in zip(self.target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_(param.data)
            return
Example #18
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']  # Counterfactual network
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']  # Local Actors
        self.aPars = params['actPars']
        self.m_params = params['m_pars']  # Manager
        self.m_train = params['m_train']
        self.local_vPars = params['local_pars']  # Local values
        self.local_vTrain = params['local_train']
        self.agents = params['agents']  # Agents

        self.pubs = {}
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.tau = self.vPars['tau']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch']
        self.td_lambda = .8

        self.c = self.m_params['c']
        self.w_discount = self.vTrain['gamma']
        self.m_discount = self.m_train['gamma']
        self.prevState = None
        self.soft = nn.Softmax(dim=1)

        self.exp = Memory()
        self.valueLoss = []
        self.actorLoss = []
        self.temp = []
        self.goal_temp1 = None
        self.goal_temp2 = None
        self.iteration = 0
        self.totalSteps = 0

        self.counter_critic = Network(self.vPars, self.vTrain)
        self.counter_target = Network(self.vPars, self.vTrain)
        self.manager = Network(self.m_params, self.m_train)  # manager
        self.actor = Network(self.aPars, self.aTrain)  # actor
        self.critic = Network(self.local_vPars, self.local_vTrain)
        self.target = Network(self.local_vPars, self.local_vTrain)

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.counter_target.parameters(),
                                       self.counter_critic.parameters()):
            target_param.data.copy_(param.data)

        self.reset()

        task.initAgent(self)
        self.stop = False
        while (not self.stop):
            x = 1 + 1

        task.postTraining()
Example #19
class SAC(Agent):
    def __init__(self, params, name, task):
        super(SAC, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']
        self.qPars = params['qPars']
        self.qTrain = params['qTrain']
        if self.trainMode:
            self.QNet = Network(self.qPars, self.qTrain).to(device)
            self.VNet = Network(self.vPars, self.vTrain).to(device)
            self.VTar = Network(self.vPars, self.vTrain).to(device)
            self.policyNet = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            print('Not implemented')

        for target_param, param in zip(self.VTar.parameters(),
                                       self.VNet.parameters()):
            target_param.data.copy_(param)

        self.expSize = self.vTrain['buffer']
        self.actions = self.aPars['neurons'][-1]
        self.state = self.aPars['neurons'][0]
        self.exp = ReplayBuffer(self.expSize, self.actions, np.float32,
                                self.state, np.float32)

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1
        task.postTraining()

    def load_nets(self):
        pass

    def saveModel(self):
        pass

    def get_action(self, s):
        action, _, _, _, _ = self.policyNet(torch.FloatTensor(s))
        action = np.ravel(action.detach().numpy())
        return action

    def send_to_device(self, s, a, r, next_s, d):
        s = torch.FloatTensor(s).to(device)
        a = torch.FloatTensor(a).to(device)
        r = torch.FloatTensor(r).unsqueeze(1).to(device)
        next_s = torch.FloatTensor(next_s).to(device)
        d = torch.FloatTensor(np.float32(d)).unsqueeze(1).to(device)
        return s, a, r, next_s, d

    def train(self):
        if len(self.exp) > 750:
            s, a, r, next_s, d = self.exp.sample_batch(self.batch_size)
            s, a, r, next_s, d = self.send_to_device(s, a, r, next_s, d)

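            # Soft actor-critic losses:
            #   Q target:     r + gamma * (1 - d) * V_target(s')
            #   V target:     Q(s, a_new) - alpha * log pi(a_new | s)
            #   policy loss:  E[ log pi * (alpha * log pi - (Q(s, a_new) - V(s))).detach() ]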
            q = self.QNet(torch.cat([s, a], dim=1))
            v = self.VNet(s)
            new_a, log_prob, z, mean, log_std = self.policyNet(s)

            target_v = self.VTar(next_s)

            next_q = r + (1 - d) * self.discount * target_v
            q_loss = self.QNet.get_loss(q, next_q.detach())

            new_q = self.QNet(torch.cat([s, new_a], dim=1))
            next_v = new_q - log_prob * self.alpha
            v_loss = self.VNet.get_loss(v, next_v.detach())

            target = new_q - v
            actor_loss = (log_prob *
                          (log_prob * self.alpha - target).detach()).mean()

            mean_loss = 1e-3 * mean.pow(2).mean()
            std_loss = 1e-3 * log_std.pow(2).mean()
            actor_loss += mean_loss + std_loss

            self.VNet.optimizer.zero_grad()
            v_loss.backward()
            self.VNet.optimizer.step()

            self.QNet.optimizer.zero_grad()
            q_loss.backward()
            self.QNet.optimizer.step()

            self.policyNet.optimizer.zero_grad()
            actor_loss.backward()
            self.policyNet.optimizer.step()

            for target_param, param in zip(self.VTar.parameters(),
                                           self.VNet.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - 5 * 1e-3) +
                                        param.data * 5 * 1e-3)

            self.totalSteps += 1
Example #20
class TRPOAgent(Agent):
    def __init__(self, params, name, task):
        super(TRPOAgent, self).__init__(params, name, task)
        self.valueNet = Network(self.vPars, self.vTrain)
        self.policyNet = Network(params['actPars'], params['actTrain'])
        self.running_state = ZFilter((self.vPars['in_n'], ), clip=5)
        self.running_reward = ZFilter((1, ), demean=False, clip=10)
        self.experience = Memory()
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        task.initAgent(self)
        while (not self.stop):
            x = 1 + 1
        task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOCritic.txt"
        )
        torch.save(
            self.policyNet.state_dict(),
            "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOPolicy.txt"
        )
        print("Network saved")

    def train(self):
        batch = self.experience.sample()
        self.update_params(batch)

    def store(self, prevS, prevA, r, s, a, failure):
        mask = 0 if failure == 1 else 1
        self.experience.push(prevS, prevA, mask, s, r)

    def update_params(self, batch):
        rewards = torch.Tensor(batch.reward)
        masks = torch.Tensor(batch.mask)
        actions = torch.Tensor(np.concatenate(batch.action, 0))
        states = torch.Tensor(batch.state)
        values = self.valueNet(Variable(states))

        returns = torch.Tensor(actions.size(0), 1)
        deltas = torch.Tensor(actions.size(0), 1)
        advantages = torch.Tensor(actions.size(0), 1)

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
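        # Generalized Advantage Estimation, computed backwards over the batch:
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * tau * A_{t+1}
        # `tau` (the GAE lambda), `l2Reg`, `maxKL`, and `damping` appear to be
        # module-level constants defined elsewhere in the source (not shown here).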
        for i in reversed(range(rewards.size(0))):
            returns[i] = rewards[i] + self.discount * prev_return * masks[i]
            deltas[i] = (rewards[i] + self.discount * prev_value * masks[i]
                         - values.data[i])
            advantages[i] = (deltas[i]
                             + self.discount * tau * prev_advantage * masks[i])

            prev_return = returns[i, 0]
            prev_value = values.data[i, 0]
            prev_advantage = advantages[i, 0]

        targets = Variable(returns)

        # Original code uses the same LBFGS to optimize the value loss
        def get_value_loss(flat_params):
            set_flat_params_to(self.valueNet, torch.Tensor(flat_params))
            for param in self.valueNet.parameters():
                if param.grad is not None:
                    param.grad.data.fill_(0)

            values_ = self.valueNet(Variable(states))

            value_loss = (values_ - targets).pow(2).mean()

            # weight decay
            for param in self.valueNet.parameters():
                value_loss += param.pow(2).sum() * l2Reg
            value_loss.backward()
            return (value_loss.data.double().numpy(),
                    get_flat_grad_from(self.valueNet).data.double().numpy())

        flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
            get_value_loss,
            get_flat_params_from(self.valueNet).double().numpy(),
            maxiter=25)
        set_flat_params_to(self.valueNet, torch.Tensor(flat_params))

        advantages = (advantages - advantages.mean()) / advantages.std()

        output = self.policyNet(Variable(states)).view(-1, self.u_n * 2)
        action_means = output.narrow(1, 0, self.u_n)
        action_log_stds = output.narrow(1, self.u_n, self.u_n)
        action_stds = torch.exp(action_log_stds)

        fixed_log_prob = normal_log_density(Variable(actions), action_means,
                                            action_log_stds,
                                            action_stds).data.clone()

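        # TRPO surrogate objective: importance-weighted policy loss
        #   L = -E[ A_t * exp(log pi_new(a|s) - log pi_old(a|s)) ]
        # evaluated against the fixed old log-probabilities computed above.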
        def get_loss(volatile=False):
            if volatile:
                with torch.no_grad():
                    output = self.policyNet(Variable(states))
            else:
                output = self.policyNet(Variable(states))

            output = output.view(-1, self.u_n * 2)
            action_means = output.narrow(1, 0, self.u_n)
            action_log_stds = output.narrow(1, self.u_n, self.u_n)
            action_stds = torch.exp(action_log_stds)

            log_prob = normal_log_density(Variable(actions), action_means,
                                          action_log_stds, action_stds)
            action_loss = -Variable(advantages) * torch.exp(
                log_prob - Variable(fixed_log_prob))
            return action_loss.mean()

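        # Per-state KL divergence between a frozen copy of the current diagonal
        # Gaussian policy (mean0, std0) and the live one (mean1, std1):
        #   KL = log(std1/std0) + (std0^2 + (mu0 - mu1)^2) / (2*std1^2) - 1/2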
        def get_kl():
            output = self.policyNet(Variable(states))
            output = output.view(-1, self.u_n * 2)
            mean1 = output.narrow(1, 0, self.u_n)
            log_std1 = output.narrow(1, self.u_n, self.u_n)
            std1 = torch.exp(log_std1)

            mean0 = Variable(mean1.data)
            log_std0 = Variable(log_std1.data)
            std0 = Variable(std1.data)
            kl = log_std1 - log_std0 + (std0.pow(2) +
                                        (mean0 - mean1).pow(2)) / (
                                            2.0 * std1.pow(2)) - 0.5
            return kl.sum(1, keepdim=True)

        loss = trpo_step(self.policyNet, get_loss, get_kl, maxKL, damping)
        self.avgLoss += loss
        self.trainIt += 1