Example #1
class HIRO(object):
    def __init__(self, params, name, task):
        self.name           = name
        self.task           = task

        self.vPars          = params['valPars']
        self.vTrain         = params['valTrain']
        self.mPars          = params['mPars']
        self.mTrain         = params['mTrain']
        self.wPars          = params['actPars']
        self.wTrain         = params['actTrain']
        self.w_vPars        = params['w_vPars']
        self.w_vTrain       = params['w_vTrain']

        self.agents         = params['agents']
        self.pubs = {}
        for key in self.agents.keys():
            bot             = self.agents[key]
            self.pubs[key]  = rospy.Publisher(bot['pub'], Vector3, queue_size = 1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size = 1)

        self.valueLoss      = []

        self.manager        = Network(self.mPars, self.mTrain)
        self.m_critic       = Network(self.vPars, self.vTrain) 
        self.m_critic_target= Network(self.vPars, self.vTrain)
        self.worker         = Network(self.wPars, self.wTrain)
        self.w_critic       = Network(self.w_vPars, self.w_vTrain)
        self.w_critic_target= Network(self.w_vPars, self.w_vTrain)

        self.m_discount     = self.vTrain['m_gamma']
        self.w_discount     = self.vTrain['w_gamma']
        self.lr             = self.vTrain['lr']
        self.trainMode      = self.vPars['trainMode']
        self.step           = self.vTrain['step']
        self.stop           = False
        self.c              = self.mTrain['c']
        self.tau            = .005
        self.noise          = Noise(self.manager.neurons[-1], theta = .4, max_sigma = .2, min_sigma = 0, decay = 1)

        self.exp            = Memory()
        self.temp           = []
        self.totalSteps     = 0
        self.soft           = nn.Softmax(dim=1)

        self.reset()

        task.initAgent(self)

        while(not self.stop):
            x = 1+1

        task.postTraining()

    def receiveDone(self, message):
        if message.data  == 1: #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2: #timed out. Check manager.py
            self.task.restartProtocol(restart = 1)

    def get_action(self, s, s_w = None):
        s = torch.FloatTensor(s)
        if self.iteration % self.c == 0:
            self.goal = self.manager(s)
            noise = torch.FloatTensor(self.noise.get_noise())
            self.goal += noise
        else:
            self.goal = self.prevState[:,:2] + self.goal - s[:,:2]

        self.temp_second = self.temp_first
        self.temp_first = self.goal
        self.prevState = s
        s = s[:,:6]
        inpt = torch.cat((s, self.goal), dim=1)
        policy = self.worker(inpt)
        policy = self.soft(policy)
        choice = np.asscalar(self.choose(policy))
        self.iteration += 1
        return choice #single env

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return action
    
    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done):
        if self.temp_second is not None:
            self.temp.append(Transition(s, a, r, 1-done, sprime, None, self.temp_second.detach().numpy(), self.goal.detach().numpy()))
            if self.iteration % self.c == 1 and self.iteration != 1: # remember, we push at 1 because we incremented in get_action
                self.temp = Transition(*zip(*self.temp))
                self.exp.push(self.temp) # store into exp
                self.temp = []


    def reset(self):
        self.iteration = 0
        self.temp_first, self.temp_second = (None, None)
        self.prevState = None
        self.temp = []
        return 
    
    def generateSamples(self, goals, states, next_states):
        next_states = next_states[:, :2]
        states = states[:, :2]
        candidates = (next_states - states).unsqueeze(0)
        candidates = torch.cat((candidates, goals.unsqueeze(0)), dim=0)
        normal = Normal(next_states - states, torch.ones(next_states.size()) / 2)
        sampled = normal.sample((8,))
        candidates = torch.cat((candidates, sampled), dim=0)
        # return shape (# candidates, batch_size, dimensions of goal)
        return candidates
    
    def getTransitions(self, initial_goals, states, next_states):
        # initial_goals shape: (# candidates ,batch_size, dimensions of goal)
        # states shape: (batch_size, c, dimensions of state)
        states = states[:,:, :2]
        next_states = next_states[:,:,:2]
        goals = [initial_goals.unsqueeze(0)]
        for c in range(self.c - 1):
            prev = goals[-1].squeeze(0)
            curr = states[:, c, :] + prev - next_states[:,c,:] # broadcast. This should take shape of initial_goals 
            goals.append(curr.unsqueeze(0))
        goals = torch.cat(goals, dim=0)
        goals = goals.transpose(0,1)
        goals = goals.transpose(1,2)
        # return shape (# candidates, batch_size, c, dimensions of goal)
        return goals
    
    def getProbabilities(self, transitions, states, actions):
        # transitions shape (# candidates, batch_size, c, dimensions of goal)
        # states shape: (batch_size, c, dimensions of state)
        # actions shape: (batch_size, c)
        states = states[:, :, :6]
        states = states.unsqueeze(0)
        size = states.size()
        states = states.expand(transitions.size()[0], size[1], size[2], size[3])
        inpt = torch.cat((states, transitions), dim=3)
        soft = nn.Softmax(dim = 3)
        actions = actions.expand(transitions.size()[0], actions.size()[0], actions.size()[1]).unsqueeze(3)
        probs = soft(self.worker(inpt)).gather(3, actions.long()).squeeze(3)
        probs = torch.prod(probs, dim=2)
        # return shape (# candidates, batch_size) of probabilities
        return probs

    def train(self):
        if len(self.exp) > 300:

            groups = self.exp.sample(self.step) # sample groupings of samples
            m_states = torch.cat([torch.Tensor(g.state[0]) for g in groups], dim=0)
            m_next_states = torch.cat([torch.Tensor(g.next_state[-1]) for g in groups], dim=0)
            m_goals = torch.cat([torch.Tensor(g.goal[0]) for g in groups], dim=0)
            m_rewards = torch.Tensor([sum(g.reward) for g in groups]).squeeze(2)
            m_masks = torch.Tensor([g.mask[-1] for g in groups]).unsqueeze(1)

            w_states = torch.cat([torch.Tensor(g.state).unsqueeze(0) for g in groups], dim=0).squeeze()
            w_next_states = torch.cat([torch.Tensor(g.next_state).unsqueeze(0) for g in groups], dim=0).squeeze()
            w_actions = torch.cat([torch.Tensor(g.action).unsqueeze(0) for g in groups], dim=0)

            candidates = self.generateSamples(m_goals, m_states, m_next_states)
            cand_transitions = self.getTransitions(candidates, w_states, w_next_states)
            probs = self.getProbabilities(cand_transitions, w_states, w_actions)
            cand_indices = probs.argmax(dim=0).unsqueeze(0).unsqueeze(2)
            cand_indices = cand_indices.expand(cand_indices.size()[0], cand_indices.size()[1], candidates.size()[2])
            m_goals = candidates.gather(0, cand_indices).squeeze() #size: (batch_size, dimension of goals)
 
            states = []
            actions = []
            next_states = []
            masks = []
            goals = []
            next_goals = []
            for g in groups:
                states.append(torch.Tensor(g.state).squeeze()[:, :6])
                actions.append(torch.Tensor(g.action).squeeze())
                next_states.append(torch.Tensor(g.next_state).squeeze()[:, :6])
                masks.append(torch.Tensor(g.mask).squeeze())
                goals.append(torch.Tensor(g.goal).squeeze())  
                next_goals.append(torch.Tensor(g.next_goal).squeeze())

            states = torch.cat(states, dim=0)
            actions = torch.cat(actions, dim=0).unsqueeze(1)
            next_states = torch.cat(next_states, dim=0)
            masks = torch.cat(masks, dim=0).unsqueeze(1)
            goals = torch.cat(goals, dim=0) 
            next_goals = torch.cat(next_goals, dim=0)
            rewards = -torch.norm(states[:,:2] + goals - next_states[:,:2], dim=1).unsqueeze(1)

            # Manager critic
            q = self.m_critic(torch.cat((m_states, m_goals), dim=1))
            m_next_actions = self.manager(m_next_states)
            q_tar = m_rewards + self.m_discount * self.m_critic_target(torch.cat((m_next_states, m_next_actions), dim=1))
            loss = self.m_critic.get_loss(q, q_tar.detach())
            self.m_critic.optimizer.zero_grad()
            loss.backward()
            self.m_critic.optimizer.step()

            # Manager actor
            new_actions = self.manager(m_states)
            q = self.m_critic(torch.cat((m_states, new_actions), dim=1))
            loss = -q.mean()
            self.manager.optimizer.zero_grad()
            loss.backward()
            self.manager.optimizer.step()

            # Worker critic
            q = self.w_critic(torch.cat((states, goals), dim=1)).gather(1, actions.long())
            next_actions = self.worker(torch.cat((next_states, next_goals), dim=1))
            next_actions = self.choose(self.soft(next_actions))
            q_tar = rewards + self.w_discount * masks * self.w_critic_target(torch.cat((next_states, next_goals), dim=1)).gather(1, torch.Tensor(next_actions).long().unsqueeze(1))
            loss = self.w_critic.get_loss(q, q_tar.detach())
            self.w_critic.optimizer.zero_grad()
            loss.backward()
            self.w_critic.optimizer.step()

            # Worker actor 
            new_actions = self.worker(torch.cat((states[:,:6], goals), dim=1))
            policy = self.soft(new_actions)
            new_actions = self.choose(policy)
            q = self.w_critic(torch.cat((states, goals), dim=1))
            q = q.gather(1, torch.Tensor(new_actions).long().unsqueeze(1))
            loss = -q.mean()
            self.worker.optimizer.zero_grad()
            loss.backward()
            self.worker.optimizer.step()
    
            for target_param, param in zip(self.m_critic_target.parameters(), self.m_critic.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            for target_param, param in zip(self.w_critic_target.parameters(), self.w_critic.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            
            # Push relabelled replay entries back into the replay buffer

            for i, goal in enumerate(m_goals):
                curr_group = groups[i]
                curr_goal = goal.unsqueeze(0).detach().numpy()
                inserts = (curr_goal,)  # build a tuple of relabelled goals
                for j in range(self.c - 1):
                    curr_goal = curr_group.state[j][:,:2].reshape(1,-1) + curr_goal - curr_group.next_state[j][:,:2].reshape(1,-1)
                    inserts = inserts + (curr_goal,)
                curr_group = curr_group._replace(goal=inserts)  # _replace returns a new namedtuple
                self.exp.push(curr_group)

            self.totalSteps += 1

            return loss
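
The goal relabelling in HIRO.train above is spread across three helper methods; the standalone sketch below, with a hypothetical linear worker and the 6-dim state slice / 2-dim goal assumed from the class, illustrates the same two mechanisms in isolation: the fixed goal transition h(s, g, s') = s[:2] + g - s'[:2] and selecting the candidate goal that maximises the likelihood of the worker's observed actions.

import torch
import torch.nn as nn

def goal_transition(s, g, s_next):
    # illustrative sketch: keep the goal pointing at the same absolute target as the agent moves
    return s[:, :2] + g - s_next[:, :2]

def intrinsic_reward(s, g, s_next):
    # worker reward: negative distance between the reached position and the goal-relative target
    return -torch.norm(s[:, :2] + g - s_next[:, :2], dim=1, keepdim=True)

def relabel_goal(worker, candidates, states, next_states, actions):
    # candidates: (n_cand, 2); states/next_states: (c, state_dim); actions: (c,)
    # the dummy worker and these shapes are assumptions chosen to mirror the class above
    best_score, best_goal = -float('inf'), candidates[0]
    for cand in candidates:
        goal, log_prob = cand.unsqueeze(0), 0.0
        for t in range(states.size(0)):
            s, s_next = states[t:t + 1], next_states[t:t + 1]
            logits = worker(torch.cat((s[:, :6], goal), dim=1))
            log_prob = log_prob + torch.log_softmax(logits, dim=1)[0, int(actions[t])]
            goal = goal_transition(s, goal, s_next)
        if float(log_prob) > best_score:
            best_score, best_goal = float(log_prob), cand
    return best_goal

# toy usage: worker = nn.Linear(6 + 2, 4)
# relabel_goal(worker, torch.randn(10, 2), torch.randn(5, 6), torch.randn(5, 6), torch.randint(0, 4, (5,)))
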
Example #2
class CounterContinuous(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = {}
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                SoftNetwork(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch_size']
        self.discount = self.vTrain['gamma']
        self.range = self.aPars['mean_range']
        self.td_lambda = .8
        self.tau = .005
        self.lower_bound = self.aTrain['clamp'][2]
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2:  #timed out. Check manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            a1, log_prob1, z, mu1, log_std1 = self.actor(
                torch.FloatTensor(s_split[0]))
            a2, log_prob2, z, mu2, log_std2 = self.actor(
                torch.FloatTensor(s_split[1]))
        else:  # TODO: Fix this below:
            a1, h_new1, log_prob1, mu1, std1 = self.actor[0](torch.FloatTensor(
                s_split[0]), self.h[0])
            a2, h_new2, log_prob2, mu2, std2 = self.actor[1](torch.FloatTensor(
                s_split[1]), self.h[1])
        action = [a1.detach().numpy().ravel(), a2.detach().numpy().ravel()]
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return np.asscalar(action)

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local)

    def reset(self):
        curr = self.actor.clamp[0]
        if self.trained:
            new = max(self.lower_bound, .05 * self.lower_bound + .95 * curr)
            self.actor.clamp = (new, self.actor.clamp[1], self.lower_bound)
        self.trained = False
        return

    def get_grad_norm(self, model):
        total_norm = 0
        for p in model.parameters():
            if p.grad is None:
                continue
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        grad_norm = total_norm**(1. / 2)
        return grad_norm

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = target_qs[-1] * mask[-1]

        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = self.td_lambda * gamma * ret[t + 1] + \
                mask[t] * (rewards[t] + (1 - self.td_lambda) * gamma * target_qs[t + 1])
        return ret.unsqueeze(1)

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def monte_carlo(self, mean, std, n=500):
        # returns tensors representing n sampled from mean and std
        normal = Normal(mean, std)
        return normal.sample((n, ))

    def train(self, episode_done=False):
        if len(self.exp) >= 500:

            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            actions = self.zipStack(transition.action)
            rewards = torch.Tensor(transition.reward).to(device)
            states_next = torch.squeeze(torch.Tensor(
                transition.next_state)).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

            actions_next = []
            for s in next_local:
                a, log_prob, _, _, _ = self.actor(s)
                actions_next.append(a.detach())
            inp = torch.cat((states_next, actions_next[0], actions_next[1]),
                            dim=1)
            q_tar = rewards.unsqueeze(
                1) + self.discount * masks.unsqueeze(1) * self.target(inp)
            inp = torch.cat((states, actions[0].detach(), actions[1].detach()),
                            dim=1)
            q = self.critic(inp)
            loss = self.critic.get_loss(q, q_tar.detach())
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()
            self.valueLoss.append(loss)

            actor_loss = 0
            actions = []
            means = []
            log_stds = []
            log_probs = []
            for s in local:
                a, log_prob, z, mu, log_std = self.actor(s)
                actions.append(a)
                means.append(mu)
                log_stds.append(log_std)
                log_probs.append(log_prob)

            # train first agent
            inp = torch.cat((states, actions[0], actions[1].detach()), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[0], log_stds[0].exp())
            samples = self.range * torch.tanh(samples)
            repeat_s = states.unsqueeze(0)
            repeat_s = repeat_s.expand(samples.size()[0],
                                       repeat_s.size()[1],
                                       repeat_s.size()[2])
            repeat_a = actions[1].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0],
                                       repeat_a.size()[1],
                                       repeat_a.size()[2])
            inp = torch.cat((repeat_s, samples, repeat_a), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[0].view(coma.size()) * (coma)).mean()

            # train second agent
            inp = torch.cat((states, actions[0].detach(), actions[1]), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[1], log_stds[1].exp())
            samples = self.range * torch.tanh(samples)
            repeat_a = actions[0].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0],
                                       repeat_a.size()[1],
                                       repeat_a.size()[2])
            inp = torch.cat((repeat_s, repeat_a, samples), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[1].view(coma.size()) * (coma)).mean()

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    torch.nn.utils.clip_grad_norm_(actor.parameters(),
                                                   self.clip_grad_norm)
                    actor.optimizer.step()
            self.totalSteps += 1
            self.trained = True

            #UPDATE TARGET NETWORK:
            if self.totalSteps % 50 == 0:
                for target_param, param in zip(self.target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_(param.data)
            return
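
CounterContinuous.train estimates a counterfactual (COMA-style) baseline per agent by Monte-Carlo sampling actions from that agent's Gaussian policy while holding the other agent's action fixed. The standalone sketch below restates that computation with a hypothetical critic and assumed shapes; the concatenation order shown follows the first agent's update and is swapped for the second agent, as in the class.

import torch
from torch.distributions import Normal

def counterfactual_advantage(critic, states, a_i, a_other, mean, std, n=500, mean_range=1.0):
    # illustrative sketch with a dummy critic; states: (B, s_dim), a_i/a_other/mean/std: (B, a_dim)
    q_taken = critic(torch.cat((states, a_i, a_other), dim=1))              # Q(s, a_i, a_-i): (B, 1)
    samples = mean_range * torch.tanh(Normal(mean, std).sample((n,)))       # resampled a_i: (n, B, a_dim)
    rep_s = states.unsqueeze(0).expand(n, *states.shape)
    rep_o = a_other.unsqueeze(0).expand(n, *a_other.shape)
    baseline = critic(torch.cat((rep_s, samples, rep_o), dim=2)).mean(0)    # E_{a_i ~ pi_i}[Q]: (B, 1)
    return (q_taken - baseline).detach()

# toy usage: critic = torch.nn.Linear(4 + 2 + 2, 1); the policy term is then
# -(log_prob_i.view(-1, 1) * counterfactual_advantage(...)).mean()
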
Example #3
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7

        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/jimmy/Documents/Research/AN_Bridging/results/hierarchical_q_policy2.txt'
        )
        pass

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)
        if len(self.priorities) < self.expSize:
            self.priorities.append(1)
        else:
            self.priorities = self.priorities[1:]
            self.priorities.append(1)

    def get_q(self, s):
        if type(self.valueNet) == list:
            model_index = np.random.randint(len(self.valueNet))
            net = self.valueNet[model_index]
        else:
            net = self.valueNet
        q = net(torch.FloatTensor(s))
        q = q.detach()
        return q

    def get_action(self, s, testing_time=False, probabilistic=False):
        i = np.random.random()
        if i < self.explore and self.trainMode and not testing_time:
            index = np.random.randint(self.out_n)
        else:
            q = self.get_q(s)
            if probabilistic:
                q = q.numpy()
                q = q - np.max(q)
                probs = np.exp(q * self.beta)
                probs = probs / np.sum(probs)
                index = np.random.choice(q.size, p=probs.ravel())
                # print('probability chosen ', probs.ravel()[index])
            else:
                index = np.argmax(q.numpy())
        self.explore = max(.2, self.explore * .9997)
        return index

    def get_q_and_q_tar(self, states, actions, nextStates, rewards, masks):
        qValues = self.valueNet(
            torch.FloatTensor(states).squeeze(1))  #pass in. Processing implied
        q = torch.gather(
            qValues, 1,
            torch.LongTensor(actions).unsqueeze(1))  #get q values of actions
        qnext = self.tarNet(torch.FloatTensor(nextStates))
        qnext = qnext.squeeze(1).detach()  #pass in

        if self.double:
            qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
            qNextDouble = qNextDouble.squeeze(1).detach()  #pass in
            qnext = torch.gather(
                qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
            qtar = torch.FloatTensor(rewards).squeeze(
                1) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
        else:
            qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                masks).unsqueeze(1) * qnext.max(1)[0].view(
                    self.batch_size, 1)  #calculate target
        return q, qtar

    def train(self, override=False):
        if len(self.exp) >= 500 or override:
            if self.priority:
                loss = 0
                weights = []
                errors = []
                assert len(self.priorities) == len(self.exp)
                for i in range(self.batch_size):
                    probs = np.array(
                        [math.pow(p, self.alpha) for p in self.priorities])
                    probs = probs / np.sum(probs)
                    choice = np.random.choice(len(self.priorities),
                                              p=probs,
                                              size=1)
                    weights.append(
                        math.pow(
                            len(self.priorities) *
                            self.priorities[int(np.asscalar(choice))],
                            -self.beta))
                    states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.get_transitions(
                        choice)
                    q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                                   rewards, masks)
                    td = qtar - q
                    self.priorities[int(np.asscalar(choice))] = abs(td[:, 0].item())
                    errors.append(self.valueNet.get_loss(q, qtar))
                max_weight = max(weights)
                weights = [w / max_weight for w in weights]
                val_loss = sum([w * e for w, e in zip(weights, errors)])

            else:
                states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                    batch=self.batch_size)

                if self.replaceCounter % self.update_target_network == 0:
                    self.tarNet.load_state_dict(self.valueNet.state_dict())
                    self.replaceCounter = 0

                if self.noise:
                    states = np.array(states)
                    states = states + np.random.normal(0, self.noise,
                                                       states.shape)

                q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                               rewards, masks)
                val_loss = self.valueNet.get_loss(q, qtar)

            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
            return val_loss
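
The priority branch of DoubleQ.train above implements proportional prioritised replay one sampled transition at a time. The sketch below restates the weighting it relies on (Schaul et al., "Prioritized Experience Replay"), vectorised over a batch for clarity; the function name, defaults, and batching are assumptions, not part of the class.

import numpy as np

def per_sample(priorities, batch_size, alpha=0.7, beta=0.4):
    # illustrative sketch; `priorities` is a plain list of |TD error| values
    p = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = p / p.sum()                                           # P(i) = p_i^alpha / sum_j p_j^alpha
    idx = np.random.choice(len(priorities), size=batch_size, p=probs)
    weights = (len(priorities) * probs[idx]) ** (-beta)           # importance weights (N * P(i))^-beta
    return idx, weights / weights.max()                           # normalise by the largest weight

# toy usage: idx, w = per_sample([1.0, 0.5, 2.0, 0.1], batch_size=2)
# the weighted value loss is then sum(w_k * loss_k) over the sampled transitions
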
Example #4
class SAC(Agent):
    def __init__(self, params, name, task):
        super(SAC, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']
        self.qPars = params['qPars']
        self.qTrain = params['qTrain']
        if self.trainMode:
            self.QNet = Network(self.qPars, self.qTrain).to(device)
            self.VNet = Network(self.vPars, self.vTrain).to(device)
            self.VTar = Network(self.vPars, self.vTrain).to(device)
            self.policyNet = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            print('Not implemented')

        for target_param, param in zip(self.VTar.parameters(),
                                       self.VNet.parameters()):
            target_param.data.copy_(param.data)

        self.expSize = self.vTrain['buffer']
        self.actions = self.aPars['neurons'][-1]
        self.state = self.aPars['neurons'][0]
        self.exp = ReplayBuffer(self.expSize, self.actions, np.float32,
                                self.state, np.float32)

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1
        task.postTraining()

    def load_nets(self):
        pass

    def saveModel(self):
        pass

    def get_action(self, s):
        action, _, _, _, _ = self.policyNet(torch.FloatTensor(s))
        action = np.ravel(action.detach().numpy())
        return action

    def send_to_device(self, s, a, r, next_s, d):
        s = torch.FloatTensor(s).to(device)
        a = torch.FloatTensor(a).to(device)
        r = torch.FloatTensor(r).unsqueeze(1).to(device)
        next_s = torch.FloatTensor(next_s).to(device)
        d = torch.FloatTensor(np.float32(d)).unsqueeze(1).to(device)
        return s, a, r, next_s, d

    def train(self):
        if len(self.exp) > 750:
            s, a, r, next_s, d = self.exp.sample_batch(self.batch_size)
            s, a, r, next_s, d = self.send_to_device(s, a, r, next_s, d)

            q = self.QNet(torch.cat([s, a], dim=1))
            v = self.VNet(s)
            new_a, log_prob, z, mean, log_std = self.policyNet(s)

            target_v = self.VTar(next_s)

            next_q = r + (1 - d) * self.discount * target_v
            q_loss = self.QNet.get_loss(q, next_q.detach())

            new_q = self.QNet(torch.cat([s, new_a], dim=1))
            next_v = new_q - log_prob * self.alpha
            v_loss = self.VNet.get_loss(v, next_v.detach())

            target = new_q - v
            actor_loss = (log_prob *
                          (log_prob * self.alpha - target).detach()).mean()

            mean_loss = 1e-3 * mean.pow(2).mean()
            std_loss = 1e-3 * log_std.pow(2).mean()
            actor_loss += mean_loss + std_loss

            self.VNet.optimizer.zero_grad()
            v_loss.backward()
            self.VNet.optimizer.step()

            self.QNet.optimizer.zero_grad()
            q_loss.backward()
            self.QNet.optimizer.step()

            self.policyNet.optimizer.zero_grad()
            actor_loss.backward()
            self.policyNet.optimizer.step()

            for target_param, param in zip(self.VTar.parameters(),
                                           self.VNet.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - 5 * 1e-3) +
                                        param.data * 5 * 1e-3)

            self.totalSteps += 1
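
For reference, the three losses SAC.train computes (this is the value-function variant of SAC) can be written compactly as below; F.mse_loss stands in for Network.get_loss, and the gamma/alpha defaults are assumptions rather than values from the class.

import torch
import torch.nn.functional as F

def sac_losses(q, v, v_tar_next, new_q, log_prob, r, d, gamma=0.99, alpha=0.2):
    # illustrative sketch; all inputs are (B, 1) tensors
    q_target = r + (1 - d) * gamma * v_tar_next                   # soft Q target
    q_loss = F.mse_loss(q, q_target.detach())
    v_target = new_q - alpha * log_prob                           # soft value target
    v_loss = F.mse_loss(v, v_target.detach())
    policy_loss = (log_prob * (alpha * log_prob - (new_q - v)).detach()).mean()
    return q_loss, v_loss, policy_loss
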
Example #5
class Counter(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = OrderedDict()
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = CounterActor(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                CounterActor(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch']
        self.discount = self.vTrain['gamma']
        self.temp_second = None
        self.temp_first = None
        self.td_lambda = 0  # TEST: this is because we are doing ER off-policy
        self.tau = .01
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2:  #timed out. Check manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            policy1 = self.actor(torch.FloatTensor(s_split[0]))
            a1 = np.asscalar(self.choose(policy1))
            policy2 = self.actor(torch.FloatTensor(s_split[1]))
            a2 = np.asscalar(self.choose(policy2))
        else:
            policy1 = self.actor[0](torch.FloatTensor(s_split[0]))
            a1 = self.choose(policy1)
            policy2 = self.actor[1](torch.FloatTensor(s_split[1]))
            a2 = self.choose(policy2)
        # THIS IS A TEST
        a1 = 0
        #print(policy1)
        #print(policy2)
        #print('')
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return action

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local,
                      None)

    def reset(self):
        self.train(True)
        if self.trained:
            self.actor.eps = max(.05, self.actor.eps - .003)
        self.trained = False
        self.temp_first, self.temp_second = (None, None)
        self.h = [
            torch.zeros((1, 1, self.h_state_n))
            for i in range(len(self.agents))
        ]
        self.prevAction = [-1, -1]
        return

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = rewards[-1] + target_qs[-1] * mask[-1]

        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = mask[t] * (self.td_lambda * gamma * ret[t + 1]) + (
                rewards[t] +
                (1 - self.td_lambda) * gamma * target_qs[t] * mask[t])
        return ret.unsqueeze(1)

    def train(self, episode_done=False):
        if len(self.exp) > self.batch_size:
            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            states_next = torch.squeeze(torch.Tensor(
                transition.next_state)).to(device)
            actions = torch.Tensor(transition.action).float().to(device)
            rewards = torch.Tensor(transition.reward).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

            actions_next = []
            for s in next_local:
                next_policy = self.actor(s)
                next_action = self.choose(next_policy)
                actions_next.append(torch.Tensor(next_action))
            '''# Critic Update
            ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states_next, actions_next[1].unsqueeze(1), ID), dim = 1)
            q_tar = self.target(inp).detach().gather(1, actions_next[0].long().unsqueeze(1))
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(), self.discount, q_tar)
            inp = torch.cat((states, actions[:, 1].unsqueeze(1), ID), dim = 1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 0].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()'''

            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states_next, actions_next[0].unsqueeze(1), ID),
                            dim=1)
            q_tar = self.target(inp).detach().gather(
                1, actions_next[1].long().unsqueeze(1))  # .max(1)?
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(),
                                            self.discount, q_tar)
            inp = torch.cat((states, actions[:, 0].unsqueeze(1), ID), dim=1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 1].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()

            actor_loss = 0
            # Actor Update. Consider doing new_actions
            policies = []
            new_actions = []
            for s in local:
                policy = self.actor(s)
                policies.append(policy)
                new_action = self.choose(policy)
                new_actions.append(torch.Tensor(new_action))
            '''ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states, new_actions[1].unsqueeze(1), ID), dim = 1)
            q_out = self.critic(inp) #batch x num_actions
            policy = policies[0] #batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[0].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[0].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss '''

            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states, new_actions[0].unsqueeze(1), ID), dim=1)
            q_out = self.critic(inp)  #batch x num_actions
            policy = policies[1]  #batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[1].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[1].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss

            self.actorLoss.append(actor_loss)

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    actor.optimizer.step()

            self.totalSteps += 1
            # self.exp = Memory()
            self.trained = True

            #UPDATE TARGET NETWORK:
            if self.totalSteps % 1 == 0:  # THIS IS A TEST
                for target_param, param in zip(self.target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_((1 - self.tau) * target_param +
                                            self.tau * param.data)

            return
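
Counter.get_lambda_targets (and the variant in CounterContinuous) builds lambda-return targets with a backward recursion. The standalone sketch below shows one common form of that recursion on a toy 1-D trajectory; indexing and discount placement differ slightly between the two implementations above, so treat it as illustrative rather than a drop-in replacement. With td_lambda = 0, as set above for off-policy replay, it collapses to the one-step target r_t + gamma * mask_t * Q_tar_t.

import torch

def lambda_targets(rewards, masks, target_qs, gamma=0.99, td_lambda=0.8):
    # illustrative sketch on 1-D toy tensors of equal length
    ret = target_qs.new_zeros(target_qs.shape)
    ret[-1] = rewards[-1] + gamma * masks[-1] * target_qs[-1]     # bootstrap the final step
    for t in range(ret.size(0) - 2, -1, -1):
        ret[t] = rewards[t] + gamma * masks[t] * (
            td_lambda * ret[t + 1] + (1 - td_lambda) * target_qs[t])
    return ret.unsqueeze(1)

# toy usage: lambda_targets(torch.ones(5), torch.ones(5), torch.zeros(5))
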
Example #6
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = [
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole.txt',
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole2.txt'
            ]
            if not load_path:
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.double = self.vTrain['double']

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/box_push_hierarchical_q_policy.txt'
        )
        pass

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)

    def get_action(self, s):
        i = np.random.random()
        if i < self.explore and self.trainMode:
            index = np.random.randint(self.out_n)
        else:
            if type(self.valueNet) == list:
                model_index = np.random.randint(len(self.valueNet))
                net = self.valueNet[model_index]
            else:
                net = self.valueNet
            q = net(torch.FloatTensor(s))
            #print(q)
            q = q.detach()
            index = np.argmax(q.numpy())
        self.explore = max(.1, self.explore * .9997)
        return index

    def train(self):
        if len(self.exp) >= 500:
            states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                batch=self.batch_size)

            if self.replaceCounter % 500 == 0:  # target refresh interval: 200 for the box-push task, 500 for the slope task
                self.tarNet.load_state_dict(self.valueNet.state_dict())
                self.replaceCounter = 0

            qValues = self.valueNet(torch.FloatTensor(states).squeeze(
                1))  #pass in. Processing implied
            q = torch.gather(qValues, 1,
                             torch.LongTensor(actions).unsqueeze(
                                 1))  #get q values of actions
            qnext = self.tarNet(torch.FloatTensor(nextStates))
            qnext = qnext.squeeze(1).detach()  #pass in

            if self.double:
                qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
                qNextDouble = qNextDouble.squeeze(1).detach()  #pass in
                qnext = torch.gather(
                    qnext, 1,
                    torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
                qtar = torch.FloatTensor(rewards).squeeze(
                    1
                ) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
            else:
                qtar = torch.FloatTensor(
                    rewards) + self.discount * torch.Tensor(
                        masks).unsqueeze(1) * qnext.max(1)[0].view(
                            self.batch_size, 1)  #calculate target

            val_loss = self.valueNet.get_loss(q, qtar)
            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
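
When self.double is set, DoubleQ.train builds the double-DQN target by selecting the argmax action with the online network and evaluating it with the target network (van Hasselt et al., "Deep Reinforcement Learning with Double Q-learning"). The sketch below isolates that target computation with hypothetical linear networks and assumed shapes.

import torch
import torch.nn as nn

def double_q_target(value_net, target_net, rewards, masks, next_states, gamma=0.99):
    # illustrative sketch; rewards/masks: (B, 1), next_states: (B, state_dim)
    with torch.no_grad():
        best_actions = value_net(next_states).argmax(dim=1, keepdim=True)  # select with the online net
        q_next = target_net(next_states).gather(1, best_actions)           # evaluate with the target net
    return rewards + gamma * masks * q_next

# toy usage: double_q_target(nn.Linear(4, 3), nn.Linear(4, 3),
#                            torch.zeros(8, 1), torch.ones(8, 1), torch.randn(8, 4))
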