Example #1
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7

        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0

        task.initAgent(self)

        if not load_path:
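            # Busy-wait until the task signals that training has stopped, then run post-training hooks.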
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/jimmy/Documents/Research/AN_Bridging/results/hierarchical_q_policy2.txt'
        )

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)
        # Every new transition starts with priority 1; drop the oldest entry when
        # the buffer is full so priorities stay aligned with the replay memory.
        if len(self.priorities) >= self.expSize:
            self.priorities = self.priorities[1:]
        self.priorities.append(1)

    def get_q(self, s):
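        # valueNet may be a single network or an ensemble (list); in the ensemble case, query a random member.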
        if type(self.valueNet) == list:
            model_index = np.random.randint(len(self.valueNet))
            net = self.valueNet[model_index]
        else:
            net = self.valueNet
        q = net(torch.FloatTensor(s))
        q = q.detach()
        return q

    def get_action(self, s, testing_time=False, probabilistic=False):
        i = np.random.random()
        if i < self.explore and self.trainMode and not testing_time:
            index = np.random.randint(self.out_n)
        else:
            q = self.get_q(s)
            if probabilistic:
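                # Boltzmann (softmax) exploration: beta acts as an inverse temperature; subtract the max for numerical stability.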
                q = q.numpy()
                q = q - np.max(q)
                probs = np.exp(q * self.beta)
                probs = probs / np.sum(probs)
                index = np.random.choice(q.size, p=probs.ravel())
                # print('probability chosen ', probs.ravel()[index])
            else:
                index = np.argmax(q.numpy())
        self.explore = max(.2, self.explore * .9997)
        return index

    def get_q_and_q_tar(self, states, actions, nextStates, rewards, masks):
        qValues = self.valueNet(
            torch.FloatTensor(states).squeeze(1))  # Q-values for all actions in the sampled states
        q = torch.gather(
            qValues, 1,
            torch.LongTensor(actions).unsqueeze(1))  # Q-values of the actions actually taken
        qnext = self.tarNet(torch.FloatTensor(nextStates))
        qnext = qnext.squeeze(1).detach()  # target-network Q-values for the next states

        if self.double:
            # Double DQN: the online network selects the next action, the target network evaluates it.
            qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
            qNextDouble = qNextDouble.squeeze(1).detach()
            qnext = torch.gather(
                qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
            qtar = torch.FloatTensor(rewards).squeeze(
                1) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
        else:
            # Vanilla DQN target: max over the target network's Q-values.
            qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                masks).unsqueeze(1) * qnext.max(1)[0].unsqueeze(1)
        return q, qtar

    def train(self, override=False):
        if len(self.exp) >= 500 or override:
            if self.priority:
                # Prioritized experience replay: sample transitions with probability
                # proportional to priority^alpha and correct the resulting bias with
                # importance-sampling weights (N * P(i))^-beta.
                weights = []
                errors = []
                assert len(self.priorities) == len(self.exp)
                for i in range(self.batch_size):
                    probs = np.array(
                        [math.pow(p, self.alpha) for p in self.priorities])
                    probs = probs / np.sum(probs)
                    choice = np.random.choice(len(self.priorities),
                                              p=probs,
                                              size=1)
                    idx = int(choice[0])
                    weights.append(
                        math.pow(len(self.priorities) * probs[idx],
                                 -self.beta))
                    states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.get_transitions(
                        choice)
                    q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                                   rewards, masks)
                    td = (qtar - q).detach()
                    self.priorities[idx] = abs(float(td[0, 0]))
                    errors.append(self.valueNet.get_loss(q, qtar))
                max_weight = max(weights)
                weights = [w / max_weight for w in weights]
                val_loss = sum([w * e for w, e in zip(weights, errors)])

            else:
                states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                    batch=self.batch_size)

                # Periodically sync the target network with the online value network.
                if self.replaceCounter % self.update_target_network == 0:
                    self.tarNet.load_state_dict(self.valueNet.state_dict())
                    self.replaceCounter = 0

                if self.noise:
                    states = np.array(states)
                    states = states + np.random.normal(0, self.noise,
                                                       states.shape)

                q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                               rewards, masks)
                val_loss = self.valueNet.get_loss(q, qtar)

            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
            return val_loss
Example #2
class Twin_DDPG(Agent):
    def __init__(self, params, name, task):
        super(Twin_DDPG, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']

        if self.trainMode:
            self.values = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            self.policyNet = TD3Network(self.aPars, self.aTrain)
            self.tarPolicy = TD3Network(self.aPars, self.aTrain)

            if self.load:
                self.load_nets()

            self.tarPolicy.load_state_dict(self.policyNet.state_dict())
            self.tar = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            for i in range(len(self.values)):
                self.tar[i].load_state_dict(self.values[i].state_dict())
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"
                ))

        self.base = self.vTrain['baseExplore']
        self.step = self.vTrain['decay']
        self.expSize = self.vTrain['buffer']
        self.exp = Replay(self.expSize)
        self.a = self.vTrain['a']
        self.tau = self.vPars['tau']
        self.smooth = self.vTrain['smooth']
        self.clip = self.vTrain['clip']
        self.delay = self.vTrain['policy_delay']
        self.mean_range = self.aPars['mean_range']
        self.noise = OUNoise(self.out_n,
                             mu=0,
                             theta=.15,
                             max_sigma=self.explore,
                             min_sigma=self.base,
                             decay=self.step)
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1
        task.postTraining()

    def load_nets(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        self.policyNet.load_state_dict(torch.load(path + "policy.txt"))
        self.values[0].load_state_dict(torch.load(path + "Qvalue1.txt"))
        self.values[1].load_state_dict(torch.load(path + "Qvalue2.txt"))

    def saveModel(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        torch.save(self.policyNet.state_dict(), path + "policy.txt")
        torch.save(self.values[0].state_dict(), path + "Qvalue1.txt")
        torch.save(self.values[1].state_dict(), path + "Qvalue2.txt")
        print("Network saved")

    def get_action(self, s):
        output = self.policyNet(torch.FloatTensor(s))
        i = np.random.random()
        if i < self.explore[0]:
            # add exploration noise to the deterministic action (TODO: replace with OU noise)
            noise = torch.from_numpy(np.random.normal(0, self.explore[1], 2))
            output = output + noise
        output = output.float()
        return output[0]

    def train(self):
        if self.dataSize > 500 and self.trainMode:
            #iteration updates
            self.trainIt += 1
            self.totalSteps += 1

            #Unpack the buffer and sample a random minibatch
            s, a, r, n_s, n_a, done = self.exp.get_data()
            c = np.random.choice(min(self.dataSize, self.expSize),
                                 self.batch_size)

            s = torch.FloatTensor(s[c])
            a = torch.FloatTensor(a[c])
            r = torch.FloatTensor(r[c])
            n_s = torch.FloatTensor(n_s[c])
            done = torch.FloatTensor(done[c])
            n_a = self.tarPolicy(n_s).detach()

            #target policy smoothing: perturb the target action with clipped Gaussian noise
            noise = torch.FloatTensor(
                np.random.normal(0, self.smooth, n_a.shape))
            n_a = n_a + torch.clamp(noise, -self.clip, self.clip)

            #clipped double-Q target: take the minimum of the two target critics
            n_sa = torch.cat((n_s, n_a), dim=1)
            qtar = r + self.discount * (1 - done) * torch.min(
                self.tar[0](n_sa).detach(), self.tar[1](n_sa).detach())

            #Value update
            sa = torch.cat((s, a), dim=1)
            for qnet in self.values:
                q = qnet(sa)
                loss = qnet.loss_fnc(q, qtar)
                qnet.optimizer.zero_grad()
                loss.backward()
                qnet.optimizer.step()
                qnet.scheduler.step()
                self.avgLoss += loss / len(self.values)

            #policy update (delayed, TD3-style: the actor and target networks update once every self.delay critic updates)
            if self.trainIt % self.delay == 0:
                act = self.policyNet(s)
                s_a = torch.cat((s, act), 1)
                q = self.values[0](s_a)
                policy_loss = -q.mean()

                self.policyNet.optimizer.zero_grad()
                policy_loss.backward()
                self.policyNet.optimizer.step()
                self.policyNet.scheduler.step()
                self.avgActLoss += policy_loss

                # Polyak (soft) update of the target policy network.
                for target_param, param in zip(self.tarPolicy.parameters(),
                                               self.policyNet.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1.0 - self.tau) *
                                            target_param.data)

                # Polyak (soft) update of the two target critics.
                for i in range(len(self.values)):
                    for target_param, param in zip(
                            self.tar[i].parameters(),
                            self.values[i].parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1.0 - self.tau) *
                                                target_param.data)
Example #3
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = [
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole.txt',
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole2.txt'
            ]
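            # With no explicit load_path, load an ensemble of pretrained value networks and sample among them at action time.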
            if not load_path:
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.double = self.vTrain['double']

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/box_push_hierarchical_q_policy.txt'
        )

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)

    def get_action(self, s):
        i = np.random.random()
        if i < self.explore and self.trainMode:
            index = np.random.randint(self.out_n)
        else:
            if type(self.valueNet) == list:
                model_index = np.random.randint(len(self.valueNet))
                net = self.valueNet[model_index]
            else:
                net = self.valueNet
            q = net(torch.FloatTensor(s))
            #print(q)
            q = q.detach()
            index = np.argmax(q.numpy())
        self.explore = max(.1, self.explore * .9997)
        return index

    def train(self):
        if len(self.exp) >= 500:
            states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                batch=self.batch_size)

            if self.replaceCounter % 500 == 0:  # THIS IS SET TO 200 FOR BOX PUSH TASK...SLOPE IS 500
                self.tarNet.load_state_dict(self.valueNet.state_dict())
                self.replaceCounter = 0

            qValues = self.valueNet(torch.FloatTensor(states).squeeze(
                1))  # Q-values for all actions in the sampled states
            q = torch.gather(qValues, 1,
                             torch.LongTensor(actions).unsqueeze(
                                 1))  # Q-values of the actions actually taken
            qnext = self.tarNet(torch.FloatTensor(nextStates))
            qnext = qnext.squeeze(1).detach()  # target-network Q-values for the next states

            if self.double:
                # Double DQN: the online network selects the next action, the target network evaluates it.
                qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
                qNextDouble = qNextDouble.squeeze(1).detach()
                qnext = torch.gather(
                    qnext, 1,
                    torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
                qtar = torch.FloatTensor(rewards).squeeze(
                    1
                ) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
            else:
                # Vanilla DQN target: max over the target network's Q-values.
                qtar = torch.FloatTensor(
                    rewards) + self.discount * torch.Tensor(
                        masks).unsqueeze(1) * qnext.max(1)[0].view(
                            self.batch_size, 1)

            val_loss = self.valueNet.get_loss(q, qtar)
            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
Example #4
class TRPOAgent(Agent):
    def __init__(self, params, name, task):
        super(TRPOAgent, self).__init__(params, name, task)
        self.valueNet = Network(self.vPars, self.vTrain)
        self.policyNet = Network(params['actPars'], params['actTrain'])
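        # Running mean/std filters: normalize observations (clipped at 5) and scale rewards (clipped at 10).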
        self.running_state = ZFilter((self.vPars['in_n'], ), clip=5)
        self.running_reward = ZFilter((1, ), demean=False, clip=10)
        self.experience = Memory()
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        task.initAgent(self)
        while (not self.stop):
            x = 1 + 1
        task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOCritic.txt"
        )
        torch.save(
            self.policyNet.state_dict(),
            "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOPolicy.txt"
        )
        print("Network saved")

    def train(self):
        batch = self.experience.sample()
        self.update_params(batch)

    def store(self, prevS, prevA, r, s, a, failure):
        mask = 0 if failure == 1 else 1
        self.experience.push(prevS, prevA, mask, s, r)

    def update_params(self, batch):
        rewards = torch.Tensor(batch.reward)
        masks = torch.Tensor(batch.mask)
        actions = torch.Tensor(np.concatenate(batch.action, 0))
        states = torch.Tensor(batch.state)
        values = self.valueNet(Variable(states))

        returns = torch.Tensor(actions.size(0), 1)
        deltas = torch.Tensor(actions.size(0), 1)
        advantages = torch.Tensor(actions.size(0), 1)

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
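        # Generalized Advantage Estimation (GAE); tau is assumed to be the module-level GAE lambda.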
        for i in reversed(range(rewards.size(0))):
            returns[i] = rewards[i] + self.discount * prev_return * masks[i]
            deltas[i] = rewards[
                i] + self.discount * prev_value * masks[i] - values.data[i]
            advantages[i] = deltas[
                i] + self.discount * tau * prev_advantage * masks[i]

            prev_return = returns[i, 0]
            prev_value = values.data[i, 0]
            prev_advantage = advantages[i, 0]

        targets = Variable(returns)

        # Original code uses the same LBFGS to optimize the value loss
        def get_value_loss(flat_params):
            set_flat_params_to(self.valueNet, torch.Tensor(flat_params))
            for param in self.valueNet.parameters():
                if param.grad is not None:
                    param.grad.data.fill_(0)

            values_ = self.valueNet(Variable(states))

            value_loss = (values_ - targets).pow(2).mean()

            # weight decay (l2Reg is assumed to be a module-level coefficient)
            for param in self.valueNet.parameters():
                value_loss += param.pow(2).sum() * l2Reg
            value_loss.backward()
            return (value_loss.data.double().numpy(),
                    get_flat_grad_from(self.valueNet).data.double().numpy())

        flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
            get_value_loss,
            get_flat_params_from(self.valueNet).double().numpy(),
            maxiter=25)
        set_flat_params_to(self.valueNet, torch.Tensor(flat_params))

        advantages = (advantages - advantages.mean()) / advantages.std()

        output = self.policyNet(Variable(states)).view(-1, self.u_n * 2)
        action_means = output.narrow(1, 0, self.u_n)
        action_log_stds = output.narrow(1, self.u_n, self.u_n)
        action_stds = torch.exp(action_log_stds)

        # Log-probabilities of the taken actions under the current (pre-update) policy,
        # held fixed for the surrogate-loss importance ratio.
        fixed_log_prob = normal_log_density(Variable(actions), action_means,
                                            action_log_stds,
                                            action_stds).data.clone()

        def get_loss(volatile=False):
            if volatile:
                with torch.no_grad():
                    output = self.policyNet(Variable(states))
            else:
                output = self.policyNet(Variable(states))

            output = output.view(-1, self.u_n * 2)
            action_means = output.narrow(1, 0, self.u_n)
            action_log_stds = output.narrow(1, self.u_n, self.u_n)
            action_stds = torch.exp(action_log_stds)

            log_prob = normal_log_density(Variable(actions), action_means,
                                          action_log_stds, action_stds)
            action_loss = -Variable(advantages) * torch.exp(
                log_prob - Variable(fixed_log_prob))
            return action_loss.mean()

        def get_kl():
            output = self.policyNet(Variable(states))
            output = output.view(-1, self.u_n * 2)
            mean1 = output.narrow(1, 0, self.u_n)
            log_std1 = output.narrow(1, self.u_n, self.u_n)
            std1 = torch.exp(log_std1)

            # KL divergence between the current policy and a detached copy of itself;
            # its gradient with respect to the current parameters is what trpo_step needs.
            mean0 = Variable(mean1.data)
            log_std0 = Variable(log_std1.data)
            std0 = Variable(std1.data)
            kl = log_std1 - log_std0 + (std0.pow(2) +
                                        (mean0 - mean1).pow(2)) / (
                                            2.0 * std1.pow(2)) - 0.5
            return kl.sum(1, keepdim=True)

        # maxKL and damping are assumed to be module-level TRPO hyperparameters.
        loss = trpo_step(self.policyNet, get_loss, get_kl, maxKL, damping)
        self.avgLoss += loss
        self.trainIt += 1