def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            if self.usePPO:
                params, grad_norm = self.adapt_ppo(train_episodes)
            else:
                params = self.adapt(train_episodes)
            self.logger.info("in surrogate_loss")
            with torch.set_grad_enabled(old_pi is None):
                if self.baseline_type == 'critic shared':
                    pi, _ = self.policy(valid_episodes.observations,
                                        params=params)
                else:
                    pi = self.policy(valid_episodes.observations,
                                     params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                if self.baseline_type == 'linear':
                    values = self.baseline(valid_episodes)
                elif self.baseline_type == 'critic separate':
                    values = self.baseline(valid_episodes.observations)
                elif self.baseline_type == 'critic shared':
                    _, values = self.policy(valid_episodes.observations,
                                            params=params)

                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
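The masked helpers that these snippets rely on are not shown on this page. Below is a minimal sketch of what weighted_mean and weighted_normalize are assumed to do for padded episode tensors (entries where the mask is zero are ignored); the exact implementations may differ between forks, and the async examples further down use a lengths-based variant instead.

import torch


def weighted_mean(tensor, dim=None, weights=None):
    # Mean of `tensor`, counting only entries where the mask `weights` is non-zero.
    if weights is None:
        return tensor.mean() if dim is None else tensor.mean(dim=dim)
    if dim is None:
        return (tensor * weights).sum() / weights.sum().clamp(min=1e-8)
    return (tensor * weights).sum(dim=dim) / weights.sum(dim=dim).clamp(min=1e-8)


def weighted_normalize(tensor, weights=None, epsilon=1e-8):
    # Standardize `tensor` to zero mean and unit variance over the masked-in entries.
    mean = weighted_mean(tensor, weights=weights)
    centered = (tensor - mean) * (1.0 if weights is None else weights)
    std = torch.sqrt(weighted_mean(centered ** 2, weights=weights))
    return centered / (std + epsilon)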
Example #2
    def surrogate_loss(self, episodes, old_pis=None):
        """
        Using TRPO.

        old_pis are not None only when doing line search?
        How are old_pis used? Like the behavior policy in TRPO? How?
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            # adapt our policy network to a new task
            params = self.adapt(train_episodes)
            # gradients are only needed when old_pi is None (i.e. outside the line search)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                # the set of policies adapted to each task
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
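To the questions in the docstring above: in the TRPO meta-step these snippets plug into, surrogate_loss is first called with old_pis=None to record the adapted per-task policies, and then called again with those fixed old_pis while backtracking over candidate meta-parameters, so old_pi acts as the behavior policy in the importance ratio. A condensed sketch of that line search, with illustrative names (line_search, metalearner) and the usual TRPO hyperparameters; the real step() method may differ in its details:

from torch.nn.utils import parameters_to_vector, vector_to_parameters


def line_search(metalearner, episodes, step, max_kl=1e-3,
                ls_max_steps=10, ls_backtrack_ratio=0.5):
    # `step` is the natural-gradient direction already computed from the surrogate loss.
    old_loss, _, old_pis = metalearner.surrogate_loss(episodes)
    old_params = parameters_to_vector(metalearner.policy.parameters())
    step_size = 1.0
    for _ in range(ls_max_steps):
        vector_to_parameters(old_params - step_size * step,
                             metalearner.policy.parameters())
        loss, kl, _ = metalearner.surrogate_loss(episodes, old_pis=old_pis)
        # Accept the candidate if the surrogate loss improved and the KL stays inside the trust region.
        if (loss - old_loss).item() < 0.0 and kl.item() < max_kl:
            return
        step_size *= ls_backtrack_ratio
    # Every candidate was rejected: restore the original meta-parameters.
    vector_to_parameters(old_params, metalearner.policy.parameters())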
Example #3
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, action_dists, critic_losses = [], [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            policy_params, critic_params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                action_dist = self.policy(valid_episodes.observations,
                                          params=policy_params)
                action_dists.append(detach_distribution(action_dist))

                if old_pi is None:
                    old_pi = detach_distribution(action_dist)

                values = self.critic(valid_episodes.observations,
                                     params=critic_params)
                advantages = valid_episodes.gae(values, tau=self.tau)
                value_loss = weighted_mean(advantages.pow(2),
                                           dim=0,
                                           weights=valid_episodes.mask)
                critic_losses.append(value_loss)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask,
                                                epsilon=1e-5)

                log_ratio = (action_dist.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(ratio * advantages.detach(),
                                      dim=0,
                                      weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(action_dist, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), action_dists,
                torch.mean(torch.stack(critic_losses, dim=0)))
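Several examples call valid_episodes.gae(values, tau=self.tau) without showing it. The sketch below is a generic generalized advantage estimation routine under the assumption that rewards and mask have shape (horizon, batch) and values has one extra bootstrap row; the method on the episodes object may handle shapes and discounting slightly differently.

import torch


def gae(rewards, values, mask, gamma=0.99, tau=0.95):
    # values is expected to have shape (horizon + 1, batch); its last row is the
    # bootstrap value, zero where the episode has already terminated.
    horizon = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    running = torch.zeros_like(rewards[0])
    for t in reversed(range(horizon)):
        delta = rewards[t] + gamma * values[t + 1] * mask[t] - values[t] * mask[t]
        running = delta + gamma * tau * running * mask[t]
        advantages[t] = running
    return advantages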
Example #4
    async def surrogate_loss(self, train_futures, valid_futures, old_pi=None):
        first_order = (old_pi is not None) or self.first_order
        # Suspend this coroutine and wait for adapt() to finish and return its value;
        # the function has to pause here first.
        params = await self.adapt(train_futures, first_order=first_order)
        """
        要等到上面的 train_futures 进行完之后,再往下进行
        每一个 train_futures 要对应一个 valid_futures,每一对是并行分开运行的
        future 的数量就是 每一个 batch 中 tasks 的数量
        """
        with torch.set_grad_enabled(old_pi is None):
            # Suspend this coroutine and wait for the valid_futures awaitable to
            # finish and return its value
            valid_episodes = await valid_futures
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            log_ratio = (pi.log_prob(valid_episodes.actions) -
                         old_pi.log_prob(valid_episodes.actions))
            ratio = torch.exp(log_ratio)

            losses = -weighted_mean(ratio * valid_episodes.advantages,
                                    lengths=valid_episodes.lengths)
            kls = weighted_mean(kl_divergence(pi, old_pi),
                                lengths=valid_episodes.lengths)

        return losses.mean(), kls.mean(), old_pi
Example #5
    async def surrogate_loss(self,
                             train_futures,
                             valid_futures,
                             old_pi=None,
                             args=None,
                             inner=None):
        first_order = (old_pi is not None) or self.first_order
        params = await self.adapt(train_futures,
                                  first_order=first_order,
                                  args=args,
                                  inner=inner)

        with torch.set_grad_enabled(old_pi is None):
            valid_episodes = await valid_futures
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            log_ratio = (pi.log_prob(valid_episodes.actions) -
                         old_pi.log_prob(valid_episodes.actions))
            ratio = torch.exp(log_ratio)

            losses = -weighted_mean(ratio * valid_episodes.advantages,
                                    lengths=valid_episodes.lengths)
            kls = weighted_mean(kl_divergence(pi, old_pi),
                                lengths=valid_episodes.lengths)

        return losses.mean(), kls.mean(), old_pi
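In the async variants, surrogate_loss handles a single task and the caller is expected to launch one coroutine per task and average the results. A hedged sketch of that outer gathering step, with an illustrative function name; the actual trainer may organize this differently:

import asyncio

import torch


async def gather_surrogate_losses(metalearner, train_futures, valid_futures):
    # One surrogate_loss coroutine per task; each awaits its own train/valid futures.
    results = await asyncio.gather(*[
        metalearner.surrogate_loss(train, valid)
        for train, valid in zip(train_futures, valid_futures)])
    losses, kls, old_pis = zip(*results)
    return (torch.mean(torch.stack(losses)),
            torch.mean(torch.stack(kls)),
            list(old_pis))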
    def kl_divergence(self, episodes, old_pis=None):
        kls = []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            self.logger.info("in kl divergence")
            if self.usePPO:
                params, grad_norm = self.adapt_ppo(train_episodes)
            else:
                params = self.adapt(train_episodes)
                grad_norm = []
            # if self.baseline_type == 'critic shared':
            #     pi, _ = self.policy(valid_episodes.observations, params=params)
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)
        self.logger.info("kl:")
        self.logger.info(kls)
        self.logger.info("grad_norm:")
        self.logger.info(grad_norm)
        #pdb.set_trace()
        return torch.mean(torch.stack(kls, dim=0))
Example #7
    def kl_divergence_ng(self, episodes):
        # episodes here are the train episodes
        pi = self.policy(episodes.observations)
        pi_detach = detach_distribution(pi)
        mask = episodes.mask
        if episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi_detach, pi), dim=0, weights=mask)
        return kl
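detach_distribution appears in every snippet on this page but is never shown. A minimal sketch of what it is assumed to do for the two common policy heads; a real implementation may cover more distribution types:

from torch.distributions import Categorical, Normal


def detach_distribution(pi):
    # Return a copy of the policy distribution whose parameters are detached from
    # the autograd graph, so it can serve as a fixed "old" policy.
    if isinstance(pi, Normal):
        return Normal(loc=pi.loc.detach(), scale=pi.scale.detach())
    if isinstance(pi, Categorical):
        return Categorical(logits=pi.logits.detach())
    raise NotImplementedError('Only Normal and Categorical are sketched here.')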
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(True):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss_clipped = ratio.clamp(1.0 - self.ppo_ratio,
                                           1.0 + self.ppo_ratio) * advantages
                loss = ratio * advantages

                loss = -torch.min(loss, loss_clipped)

                loss = weighted_mean(loss, dim=0, weights=valid_episodes.mask)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(old_pi, pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)
                losses.append(loss + kl * 0.0005)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
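For reference, the clipping logic in the example above can be isolated into a small helper that restates what the lines between ratio and loss compute, with self.ppo_ratio playing the role of clip_eps (the helper name is illustrative):

import torch


def ppo_clipped_loss(ratio, advantages, clip_eps):
    # Per-step PPO objective to minimize: -min(r * A, clip(r, 1 - eps, 1 + eps) * A).
    unclipped = ratio * advantages
    clipped = ratio.clamp(1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(unclipped, clipped)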
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                # detach the mu and scale parameters of distribution pi so that
                # no gradients flow into them and they are not updated
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)
                # initially zero; it changes during the line search as pi changes
                # while old_pi stays fixed
                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                # print('log_ratio: ',log_ratio)
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)
                # print('ratio: ', ratio)
                # print('advantages: ', advantages)
                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                # the weighted_mean loss is very small, on the order of 1e-8
                print('loss: ', loss)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #10
    def surrogate_loss(self, episodes, old_pis=None):
        """Computes the surrogate loss in TRPO:
        (pi(a|s) / q(a|s)) * Q(s,a) in Eqn 14
        Because the meta-loss tried to find theta that minimizes
        loss with phi, the loss is computed with valid episodes  
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)

            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages, weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) - old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)  # Convert back to ratio from log

                loss = -weighted_mean(ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
                kls.append(kl)

        return (
            torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)), 
            pis)
    def kl_divergence(self, episodes, old_pis=None):
        kls = []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

        return torch.mean(torch.stack(kls, dim=0))
Example #12
    def kl_divergence(self, episodes, old_pis=None):
        """In Trust Region Policy Optimization (TRPO, [4]), the heuristic
        approximation which considers the "average" KL divergence is used instead
        """
        kls = []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

        return torch.mean(torch.stack(kls, dim=0))
    def step(self,
             episodes,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        """Meta-optimization step (ie. update of the initial parameters), based 
        on Trust Region Policy Optimization (TRPO, [4]).
        """
        old_pis = []
        for train_episodes, valid_episodes in episodes:
            params = self.adapt(train_episodes)
            pi = self.policy(valid_episodes.observations, params=params)
            old_pis.append(detach_distribution(pi))

        for _ in range(self.optimization_epochs):
            self.optimizer.zero_grad()
            old_loss, _, _ = self.surrogate_loss(episodes, old_pis=old_pis)
            old_loss.backward()
            # torch.nn.utils.clip_grad_norm_(self.policy.parameters(), self.gradient_clip)
            self.optimizer.step()
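This first-order variant of step() replaces the TRPO line search with a plain optimizer loop. A hedged sketch of the attributes it assumes were set up beforehand, with illustrative names and values:

import torch


def configure_first_order_optimizer(learner, lr=3e-4, optimization_epochs=4):
    # Attach the attributes that the step() variant above relies on.
    learner.optimizer = torch.optim.Adam(learner.policy.parameters(), lr=lr)
    learner.optimization_epochs = optimization_epochs
    learner.gradient_clip = 0.5  # only used if the clip_grad_norm_ line is uncommented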
Example #14
    def surrogate_loss(self, episodes, old_pis=None):
        """
        Surrogate objective:
        E_r SmoothReLU( V_r^{adapted self.policy} - \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi )

        V_r^{adapted self.policy} can be evaluated by valid_episodes in episodes
        \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi is computed in self.values_of_optimized_policies

        :param episodes: [(episodes before adapting, episodes after adapting) for task in sampled tasks]
        :param old_pis: dummy parameter derived from super
        :return: mean of losses, mean of kls, pis
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for episode_index in range(len(episodes)):
            (train_episodes, valid_episodes) = episodes[episode_index]
            old_pi = old_pis[episode_index]

            if self.current_policy_idx == 0:
                dominance_correction = 1
            else:
                difference_from_best_value = total_rewards(
                    valid_episodes.rewards
                ) - self.values_of_optimized_policies[episode_index]
                dominance_correction = 1 - 1 / (
                    1 + math.exp(difference_from_best_value))

            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -dominance_correction * weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        if len(losses) == 0 or len(kls) == 0:
            # signal to the caller that there are no losses, to avoid taking the mean of empty tensors
            return (None, None, pis)
        else:
            return (torch.mean(torch.stack(losses, dim=0)),
                    torch.mean(torch.stack(kls, dim=0)), pis)
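The dominance_correction computed in the loop above simplifies to a logistic sigmoid of the value gap, which matches the derivative of the SmoothReLU (softplus) objective in the docstring; a small equivalent helper for clarity (the function name mirrors the variable above and is illustrative):

import math


def dominance_correction(difference_from_best_value):
    # 1 - 1 / (1 + exp(d)) == exp(d) / (1 + exp(d)) == sigmoid(d),
    # i.e. the derivative of softplus (SmoothReLU) evaluated at d.
    return 1.0 / (1.0 + math.exp(-difference_from_best_value))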
Example #15
    def compute_ng_gradient(self,
                            episodes,
                            max_kl=1e-3,
                            cg_iters=20,
                            cg_damping=1e-2,
                            ls_max_steps=10,
                            ls_backtrack_ratio=0.5):
        ng_grads = []
        for train_episodes, valid_episodes in episodes:
            params, step_size, step = self.adapt(train_episodes)

            # compute $grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta)
            pi = self.policy(valid_episodes.observations, params=params)
            pi_detach = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = pi.log_prob(
                valid_episodes.actions) - pi_detach.log_prob(
                    valid_episodes.actions)
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(
                ratio * advantages, dim=0, weights=valid_episodes.mask)

            ng_grad_0 = torch.autograd.grad(
                loss, self.policy.parameters())  # no create graph
            ng_grad_0 = parameters_to_vector(ng_grad_0)
            # compute the inverse of the Fisher matrix at x = \theta times $grad with Conjugate Gradient
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_inv_grad = conjugate_gradient(hessian_vector_product,
                                            ng_grad_0,
                                            cg_iters=cg_iters)

            # compute $ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times $F_inv_grad
            # create graph for higher-order differentiation
            # self.baseline.fit(train_episodes)
            loss = self.inner_loss(train_episodes)
            grad = torch.autograd.grad(loss,
                                       self.policy.parameters(),
                                       create_graph=True)
            grad = parameters_to_vector(grad)
            grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
            ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                            self.policy.parameters())
            ng_grad_1 = parameters_to_vector(ng_grad_1)
            # compute $ng_grad_2 = the Jacobian of {F(x) U(\theta)} at x = \theta times $F_inv_grad
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_U = hessian_vector_product(step)
            ng_grad_2 = torch.autograd.grad(
                torch.dot(F_U, F_inv_grad.detach()), self.policy.parameters())
            ng_grad_2 = parameters_to_vector(ng_grad_2)
            ng_grad = ng_grad_0 - step_size * (ng_grad_1 + ng_grad_2)

            ng_grad = parameters_to_vector(ng_grad)
            ng_grads.append(ng_grad.view(len(ng_grad), 1))

        return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
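compute_ng_gradient relies on a conjugate_gradient helper to apply the inverse Fisher matrix without forming it explicitly. A standard sketch of that solver, where f_Ax is the Fisher-vector product closure returned by hessian_vector_product_ng; the repository's own version may differ in stopping criteria:

import torch


def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Iteratively solve F x = b using only Fisher-vector products f_Ax(v) = F v.
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr.item() < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x.detach()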