Example #1
    def loss(self, episodes, inds=None):
        """
        PPO Surrogate Loss
        """
        log_ratios, advantages, values, entropy = self._forward_policy(
            episodes, ratio=True)

        # clipped pg loss
        ratio = torch.exp(log_ratios)
        pg_loss1 = -advantages * ratio
        pg_loss2 = -advantages * torch.clamp(
            ratio, min=1.0 - self.clip_frac, max=1.0 + self.clip_frac)

        # clipped value loss
        values_clipped = episodes.old_values + torch.clamp(
            values.squeeze() - episodes.old_values,
            min=-self.clip_frac,
            max=self.clip_frac)
        vf_loss1 = (values.squeeze() - episodes.returns)**2
        vf_loss2 = (values_clipped - episodes.returns)**2

        if inds is None:
            inds = np.arange(self.num_workers)

        masks = episodes.mask[:, inds]
        pg_loss = weighted_mean(torch.max(pg_loss1, pg_loss2)[:, inds],
                                dim=0,
                                weights=masks)
        vf_loss = 0.5 * weighted_mean(
            torch.max(vf_loss1, vf_loss2)[:, inds], dim=0, weights=masks)
        entropy_loss = weighted_mean(entropy[:, inds], dim=0, weights=masks)
        return pg_loss + self.vf_coef * vf_loss - self.ent_coef * entropy_loss
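Below is a minimal usage sketch (not part of the example) showing how a loss like the one above might be driven from a PPO-style minibatch loop. The names agent and optimizer, the epoch/minibatch scheme, and the assumption that episodes.mask has shape (timesteps, num_workers) are all hypothetical.

import numpy as np
import torch


def ppo_update(agent, optimizer, episodes, num_epochs=4, minibatch_size=8):
    # Hypothetical driver: shuffle worker indices into minibatches and take
    # several gradient steps on the clipped surrogate loss defined above.
    num_workers = episodes.mask.shape[1]
    for _ in range(num_epochs):
        perm = np.random.permutation(num_workers)
        for start in range(0, num_workers, minibatch_size):
            inds = perm[start:start + minibatch_size]
            # .mean() reduces to a scalar in case the loss is reported per worker
            loss = agent.loss(episodes, inds=inds).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()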
Example #2
    async def surrogate_loss(self,
                             train_futures,
                             valid_futures,
                             old_pi=None,
                             args=None,
                             inner=None):
        first_order = (old_pi is not None) or self.first_order
        params = await self.adapt(train_futures,
                                  first_order=first_order,
                                  args=args,
                                  inner=inner)

        with torch.set_grad_enabled(old_pi is None):
            valid_episodes = await valid_futures
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            log_ratio = (pi.log_prob(valid_episodes.actions) -
                         old_pi.log_prob(valid_episodes.actions))
            ratio = torch.exp(log_ratio)

            losses = -weighted_mean(ratio * valid_episodes.advantages,
                                    lengths=valid_episodes.lengths)
            kls = weighted_mean(kl_divergence(pi, old_pi),
                                lengths=valid_episodes.lengths)

        return losses.mean(), kls.mean(), old_pi
Example #3
    async def surrogate_loss(self, train_futures, valid_futures, old_pi=None):
        first_order = (old_pi is not None) or self.first_order
        # Suspend this coroutine until adapt() has finished and returned the
        # adapted parameters; we must wait here before moving on.
        params = await self.adapt(train_futures, first_order=first_order)
        # Only continue once the train_futures above have completed.
        # Each train_futures is paired with a valid_futures, and every pair
        # runs concurrently; the number of futures equals the number of tasks
        # in each batch (see the asyncio sketch after this example).
        with torch.set_grad_enabled(old_pi is None):
            # Suspend here until the valid_futures coroutine finishes and returns
            valid_episodes = await valid_futures
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            log_ratio = (pi.log_prob(valid_episodes.actions) -
                         old_pi.log_prob(valid_episodes.actions))
            ratio = torch.exp(log_ratio)

            losses = -weighted_mean(ratio * valid_episodes.advantages,
                                    lengths=valid_episodes.lengths)
            kls = weighted_mean(kl_divergence(pi, old_pi),
                                lengths=valid_episodes.lengths)

        return losses.mean(), kls.mean(), old_pi
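A minimal sketch (not from the examples) of how these per-task coroutines might be gathered, assuming an agent object exposing the async surrogate_loss above and one train/valid future pair per task; the helper name outer_losses is made up.

import asyncio


async def outer_losses(agent, train_futures, valid_futures):
    # One surrogate_loss coroutine per task; each awaits its own adaptation
    # (train) futures and validation futures, so the tasks run concurrently.
    results = await asyncio.gather(*[
        agent.surrogate_loss(train, valid)
        for train, valid in zip(train_futures, valid_futures)
    ])
    losses, kls, old_pis = zip(*results)
    return losses, kls, old_pis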
Example #4
    def loss(self, episodes):
        """
        REINFORCE gradient with baseline [2], computed on advantages estimated 
        with Generalized Advantage Estimation (GAE, [3]).
        """
        log_probs, advantages, values, entropy = self._forward_policy(episodes)

        pg_loss = -weighted_mean(log_probs * advantages, dim=0, weights=episodes.mask)
        vf_loss = 0.5 * weighted_mean((values.squeeze() - episodes.returns) ** 2, dim=0, weights=episodes.mask)
        entropy_loss = weighted_mean(entropy, dim=0, weights=episodes.mask)
        return pg_loss + self.vf_coef * vf_loss - self.ent_coef * entropy_loss
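For reference, a minimal sketch of the GAE recursion that a call like episodes.gae(values, tau=self.tau) presumably implements; the real BatchEpisodes.gae may differ in details such as bootstrapping the final value, and gamma is assumed to be stored elsewhere.

import torch


def gae(rewards, values, mask, gamma=0.99, tau=0.95):
    # rewards, mask, values: (timesteps, batch); the value after the last step is taken as 0.
    T = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    running = torch.zeros_like(rewards[0])
    for t in reversed(range(T)):
        next_value = values[t + 1] if t + 1 < T else torch.zeros_like(values[t])
        # TD residual, masked so finished episodes contribute nothing
        delta = rewards[t] + gamma * next_value * mask[t] - values[t]
        running = delta + gamma * tau * mask[t] * running
        advantages[t] = running * mask[t]
    return advantages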
Example #5
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            if self.usePPO:
                params, grad_norm = self.adapt_ppo(train_episodes)
            else:
                params = self.adapt(train_episodes)
            self.logger.info("in surrogate_loss")
            with torch.set_grad_enabled(old_pi is None):
                if self.baseline_type == 'critic shared':
                    # the shared-critic policy returns (pi, values)
                    pi, _ = self.policy(valid_episodes.observations,
                                        params=params)
                else:
                    pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                if self.baseline_type == 'linear':
                    values = self.baseline(valid_episodes)
                elif self.baseline_type == 'critic separate':
                    values = self.baseline(valid_episodes.observations)
                elif self.baseline_type == 'critic shared':
                    _, values = self.policy(valid_episodes.observations,
                                            params=params)

                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #6
    def surrogate_loss(self, episodes, old_pis=None):
        """
        Using TRPO.

        old_pis are not None only when doing line search?
        How are old_pis used? Like the behavior policy in TRPO? How?
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            # adapt our policy network to a new task
            params = self.adapt(train_episodes)
            # gradients are only needed when old_pi is None (i.e., outside the line search)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                # the set of policies adapted to each task
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #7
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, action_dists, critic_losses = [], [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            policy_params, critic_params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                action_dist = self.policy(valid_episodes.observations,
                                          params=policy_params)
                action_dists.append(detach_distribution(action_dist))

                if old_pi is None:
                    old_pi = detach_distribution(action_dist)

                values = self.critic(valid_episodes.observations,
                                     params=critic_params)
                advantages = valid_episodes.gae(values, tau=self.tau)
                value_loss = weighted_mean(advantages.pow(2),
                                           dim=0,
                                           weights=valid_episodes.mask)
                critic_losses.append(value_loss)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask,
                                                epsilon=1e-5)

                log_ratio = (action_dist.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(ratio * advantages.detach(),
                                      dim=0,
                                      weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(action_dist, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), action_dists,
                torch.mean(torch.stack(critic_losses, dim=0)))
Example #8
    def kl_divergence(self, episodes, old_pis=None):
        kls = []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            self.logger.info("in kl divergence")
            if self.usePPO:
                params, grad_norm = self.adapt_ppo(train_episodes)
            else:
                params = self.adapt(train_episodes)
                grad_norm = []
            #if self.baseline_type == 'critic shared':
            #    pi, _ = self.policy(valid_episodes.observations, params=params)
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)
        self.logger.info("kl:")
        self.logger.info(kls)
        self.logger.info("grad_norm:")
        self.logger.info(grad_norm)
        #pdb.set_trace()
        return torch.mean(torch.stack(kls, dim=0))
Example #9
    def inner_critic_loss(self, episodes, params=None):
        values = self.critic(episodes.observations)
        advantages = episodes.gae(values, tau=self.tau)
        value_loss = weighted_mean(advantages.pow(2),
                                   dim=0,
                                   weights=episodes.mask)
        return value_loss
Example #10
    def kl_divergence_ng(self, episodes):
        # episodes here are the train episodes
        pi = self.policy(episodes.observations)
        pi_detach = detach_distribution(pi)
        mask = episodes.mask
        if episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi_detach, pi), dim=0, weights=mask)
        return kl
Example #11
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(True):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss_clipped = ratio.clamp(1.0 - self.ppo_ratio,
                                           1.0 + self.ppo_ratio) * advantages
                loss = ratio * advantages

                loss = -torch.min(loss, loss_clipped)

                loss = weighted_mean(loss, dim=0, weights=valid_episodes.mask)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(old_pi, pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)
                losses.append(loss + kl * 0.0005)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #12
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                # detach the mu/scale parameters of pi: no gradients flow to this stored copy
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)
                # initially zero; during the line search pi changes while old_pi stays fixed
                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                # print('log_ratio: ',log_ratio)
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)
                # print('ratio: ', ratio)
                # print('advantages: ', advantages)
                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                # the weighted_mean loss is very small, ~1e-8 in magnitude
                print('loss: ', loss)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #13
    def surrogate_loss(self, episodes, old_pis=None):
        """Computes the surrogate loss in TRPO:
        (pi(a|s) / q(a|s)) * Q(s,a) in Eqn 14
        Because the meta-loss tried to find theta that minimizes
        loss with phi, the loss is computed with valid episodes  
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)

            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages, weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) - old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)  # Convert back to ratio from log

                loss = -weighted_mean(ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
                kls.append(kl)

        return (
            torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)), 
            pis)
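For context, the importance-sampled objective the docstring points to (Eqn. 14 of the TRPO paper, up to notation) is roughly

    \max_\theta \; \mathbb{E}_{s \sim \rho_{\theta_{old}},\, a \sim q}\left[ \frac{\pi_\theta(a \mid s)}{q(a \mid s)}\, Q_{\theta_{old}}(s, a) \right],

with the estimated (normalized) advantage standing in for Q and the expectation taken over the post-adaptation validation episodes, which is what the masked weighted_mean over ratio * advantages computes above.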
Example #14
def reinforce_loss(policy, episodes, params=None):
    pi = policy(episodes.observations.view((-1, *episodes.observation_shape)),
                params=params)

    log_probs = pi.log_prob(episodes.actions.view((-1, *episodes.action_shape)))
    log_probs = log_probs.view(len(episodes), episodes.batch_size)

    losses = -weighted_mean(log_probs * episodes.advantages,
                            lengths=episodes.lengths)

    return losses.mean()
Example #15
def val(args, sampler_val, policy, baseline, batch):
    start_time = time.time()

    from maml_rl.utils.torch_utils import weighted_normalize, weighted_mean
    tasks_val = sampler_val.sample_tasks()
    task_to_episodes = dict()
    for task in tasks_val:
        task_episodes = []
        sampler_val.reset_task(task)
        for i_episode in range(args.num_adapt_val + 1):
            if i_episode == 0:
                params = None
            episodes = sampler_val.sample(policy,
                                          params=params,
                                          gamma=args.gamma,
                                          device=args.device)

            # compute inner loss
            baseline.fit(episodes)
            values = baseline(episodes)
            advantages = episodes.gae(values, tau=args.tau)
            advantages = weighted_normalize(advantages, weights=episodes.mask)

            pi = policy(episodes.observations, params=params)
            log_probs = pi.log_prob(episodes.actions)
            if log_probs.dim() > 2:
                log_probs = torch.sum(log_probs, dim=2)
            entropy = pi.entropy().mean()
            loss = -weighted_mean(
                log_probs * advantages, dim=0,
                weights=episodes.mask) - args.entropy_coef_val * entropy
            fast_lr = args.fast_lr if i_episode == 0 else args.fast_lr_val_after_one
            if i_episode <= args.num_adapt_val:
                params = policy.update_params(loss,
                                              step_size=fast_lr,
                                              first_order=True)
            task_episodes.append(episodes)
        task_to_episodes[str(task)] = task_episodes

    for i_episode in range(args.num_adapt_val + 1):
        returns = calculate_returns([
            task_episodes[i_episode].rewards
            for task_episodes in task_to_episodes.values()
        ])
        logger.logkv(f'val_return_avg_adapt{i_episode}', returns.mean().item())
        logger.logkv(f'val_return_std_adapt{i_episode}', returns.std().item())

    logger.logkv('val_time', time.time() - start_time)

    save_dir = os.path.join(args.log_dir, 'val')
    os.makedirs(save_dir, exist_ok=True)
    pickle.dump(task_to_episodes,
                open(os.path.join(save_dir, f'val_{batch}.pkl'), 'wb'))
Example #16
def test_weighted_mean_side_effect():
    lengths = [2, 3, 7, 5, 11]
    # Inputs
    inputs_np = np.random.rand(13, 5).astype(np.float32)

    # Pytorch
    inputs_th = torch.as_tensor(inputs_np)
    mean_th = weighted_mean(inputs_th, lengths=lengths)

    for i, length in enumerate(lengths):
        assert (inputs_th[length:, i] == 0.).all()
        assert (inputs_np[length:, i] == 0.).all()
Example #17
    def inner_loss(self, episodes, params=None):
        """Compute the inner loss for the one-step gradient update. The inner 
        loss is REINFORCE with baseline [2], computed on advantages estimated 
        with Generalized Advantage Estimation (GAE, [3]).
        """
        values = self.baseline(episodes)
        advantages = episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages, weights=episodes.mask)

        pi = self.policy(episodes.observations, params=params)
        log_probs = pi.log_prob(episodes.actions)
        if log_probs.dim() > 2:
            log_probs = torch.sum(log_probs, dim=2)
        loss = -weighted_mean(log_probs * advantages, dim=0)

        return loss
Example #18
    def kl_divergence(self, episodes, old_pis=None):
        kls = []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

        return torch.mean(torch.stack(kls, dim=0))
Example #19
    def inner_loss(self, episodes, params=None):
        """Compute the inner loss for the one-step gradient update. The inner 
        loss is REINFORCE with baseline [2], computed on advantages estimated 
        with Generalized Advantage Estimation (GAE, [3]).
        The baseline is subtracted from the empirical return to reduce the
        variance of the gradient estimate. Here, a linear function of a
        time-varying feature vector is used as the baseline.
        """
        values = self.baseline(episodes)
        advantages = episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages, weights=episodes.mask)

        pi = self.policy(episodes.observations, params=params)
        log_probs = pi.log_prob(episodes.actions)
        if log_probs.dim() > 2:
            log_probs = torch.sum(log_probs, dim=2)
        loss = -weighted_mean(log_probs * advantages, dim=0, weights=episodes.mask)

        return loss
Example #20
    def inner_loss(self, episodes, l_params=None, h_params=None):
        """Compute the inner loss for the one-step gradient update. The inner
        loss is REINFORCE with baseline [2], computed on advantages estimated
        with Generalized Advantage Estimation (GAE, [3]).
        """
        values = self.baseline(episodes)
        advantages = episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages, weights=episodes.mask)

        # First we calculate the latent space actions from the higher level policy (stored in episodes).
        # Then we calculate the lower level actions using the higher level actions
        pi_higher = self.h_policy(episodes.observations, params=h_params)
        # Calculate the log probability
        log_probs = pi_higher.log_prob(episodes.higher_level_actions)
        if log_probs.dim() > 2:
            log_probs = torch.sum(log_probs, dim=2)
        loss = -weighted_mean(log_probs * advantages, dim=0)

        return loss
Example #21
    def inner_loss(self, episodes, params=None):
        """Compute the inner loss for the one-step gradient update. The inner
        loss is REINFORCE with baseline [2], computed on advantages estimated
        with Generalized Advantage Estimation (GAE, [3]).
        https://pytorch.org/docs/0.3.1/distributions.html (except using
        advantages instead of rewards). Implements Eq. 4.
        """

        vf_loss = -1
        loss = 0
        if self.baseline_type == 'linear':
            values = self.baseline(episodes)
        elif self.baseline_type == 'critic separate':
            values = self.baseline(episodes.observations)
            # value loss: RMS of (V(s) - R); the view is hard-coded to
            # 200 steps x 20 episodes
            R = episodes.returns.view([200, 20, 1])
            vf_loss = (((values - R)**2).mean())**(1 / 2)
        #else:
        #    pi,values = self.policy(episodes.observations)
        #    pi,vi = self.policy(episodes.observations,params=params)
        #    log_probs = pi.log_prob(values.size())
        #    loss = (((values - R) ** 2).mean()) ** (1 / 2)

        advantages = episodes.gae(values, tau=self.tau)
        advantages_unnorm = advantages
        sum_adv = torch.sum(advantages_unnorm).detach().numpy()
        logging.info("unnormalized advantages: " + str(sum_adv))
        logging.info("sum of returns:" + str(torch.sum(episodes.returns)))

        advantages = weighted_normalize(advantages, weights=episodes.mask)

        pi = self.policy(episodes.observations, params=params)
        log_probs = pi.log_prob(episodes.actions)
        if log_probs.dim() > 2:
            # sum the log-probabilities over the action dimensions
            log_probs = torch.sum(log_probs, dim=2)
        loss = loss - weighted_mean(
            log_probs * advantages, dim=0, weights=episodes.mask)
        logging.info("inner loss: " + str(loss))

        return loss, vf_loss
Example #22
def test_weighted_mean_no_dim():
    lengths = [2, 3, 5, 7, 11]
    # Inputs
    inputs_np = np.random.rand(5, 13).astype(np.float32)
    weights_np = np.zeros((5, 13), dtype=np.float32)
    for i, length in enumerate(lengths):
        inputs_np[i, length:] = 0.
        weights_np[i, :length] = 1.
    # Pytorch
    inputs_th = torch.from_numpy(inputs_np)
    weights_th = torch.from_numpy(weights_np)
    mean_th = weighted_mean(inputs_th, dim=None, weights=weights_th)
    # Numpy
    sum_np, num_np = 0., 0.
    for i in range(5):
        for j in range(13):
            sum_np += inputs_np[i, j] * weights_np[i, j]
            num_np += weights_np[i, j]
    mean_np = sum_np / num_np

    assert mean_th.dim() == 0
    assert np.allclose(mean_th.item(), mean_np)
Example #23
def reinforce_loss(policy,
                   episodes,
                   init_std=1.0,
                   min_std=1e-6,
                   output_size=2
                   ):
    output = policy(episodes.observations.view((-1, *episodes.observation_shape)))

    # Note: sigma is created afresh on every call and never trained, so the
    # policy scale is effectively fixed at init_std here.
    min_log_std = math.log(min_std)
    sigma = nn.Parameter(torch.Tensor(output_size))
    sigma.data.fill_(math.log(init_std))

    scale = torch.exp(torch.clamp(sigma, min=min_log_std))
    pi = Independent(Normal(loc=output, scale=scale), 1)

    log_probs = pi.log_prob(episodes.actions.view((-1, *episodes.action_shape)))
    log_probs = log_probs.view(len(episodes), episodes.batch_size)

    losses = -weighted_mean(log_probs * episodes.advantages,
                            lengths=episodes.lengths)

    return losses.mean()
Example #24
    def kl_divergence(self, episodes, old_pis=None):
        """In Trust Region Policy Optimization (TRPO, [4]), the heuristic
        approximation which considers the "average" KL divergence is used instead
        """
        kls = []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            pi = self.policy(valid_episodes.observations, params=params)

            if old_pi is None:
                old_pi = detach_distribution(pi)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

        return torch.mean(torch.stack(kls, dim=0))
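The "average" KL heuristic mentioned above replaces the per-state maximum KL constraint of the exact theory with its expectation over states visited by the old policy, roughly

    \bar{D}_{KL}(\theta_{old}, \theta) = \mathbb{E}_{s \sim \rho_{\theta_{old}}}\left[ D_{KL}\left( \pi_{\theta_{old}}(\cdot \mid s) \,\|\, \pi_\theta(\cdot \mid s) \right) \right] \le \delta.

The code estimates this as the masked weighted_mean of kl_divergence(pi, old_pi), i.e. with the arguments in the opposite order; both directions vanish at theta = theta_old and agree to second order, so either works as a trust-region measure.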
Example #25
def test_weighted_mean_multi_dimensional():
    lengths = [2, 3, 7, 5, 11]
    # Inputs
    inputs_np = np.random.rand(13, 5, 17, 19).astype(np.float32)
    for i, length in enumerate(lengths):
        inputs_np[length:, i] = 0.

    # Pytorch
    inputs_th = torch.as_tensor(inputs_np)
    mean_th = weighted_mean(inputs_th, lengths=lengths)

    # Numpy
    mean_np = np.zeros((5, 17, 19), dtype=np.float32)
    for i, length in enumerate(lengths):
        for j in range(13):
            if j < length:
                mean_np[i] += inputs_np[j, i]
        mean_np[i] /= length

    assert mean_th.dim() == 3
    assert mean_th.shape == (5, 17, 19)
    np.testing.assert_allclose(mean_th.detach().numpy(), mean_np)
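The tests above pin down the two calling conventions used throughout these examples: weights= together with an explicit dim, and lengths= on tensors laid out as (timesteps, batch, ...). Below is a minimal sketch consistent with those tests, including the in-place masking implied by test_weighted_mean_side_effect; the real maml_rl.utils.torch_utils.weighted_mean may differ.

import torch


def weighted_mean(tensor, lengths=None, dim=None, weights=None):
    if lengths is not None:
        # (timesteps, batch, ...) layout: zero the padding in place (as the
        # side-effect test expects) and average each episode over its length.
        mask = torch.zeros_like(tensor)
        for i, length in enumerate(lengths):
            mask[:length, i].fill_(1.0)
        tensor.mul_(mask)
        extra_dims = tensor.dim() - 2
        lengths_t = torch.as_tensor(lengths, dtype=tensor.dtype)
        return tensor.sum(dim=0) / lengths_t.view((-1,) + (1,) * extra_dims)
    if weights is None:
        return tensor.mean() if dim is None else tensor.mean(dim=dim)
    if dim is None:
        return (tensor * weights).sum() / weights.sum()
    return (tensor * weights).sum(dim=dim) / weights.sum(dim=dim)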
Example #26
    def compute_ng_gradient(self,
                            episodes,
                            max_kl=1e-3,
                            cg_iters=20,
                            cg_damping=1e-2,
                            ls_max_steps=10,
                            ls_backtrack_ratio=0.5):
        ng_grads = []
        for train_episodes, valid_episodes in episodes:
            params, step_size, step = self.adapt(train_episodes)

            # compute $grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta)
            pi = self.policy(valid_episodes.observations, params=params)
            pi_detach = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = pi.log_prob(
                valid_episodes.actions) - pi_detach.log_prob(
                    valid_episodes.actions)
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(
                ratio * advantages, dim=0, weights=valid_episodes.mask)

            ng_grad_0 = torch.autograd.grad(
                loss, self.policy.parameters())  # no create graph
            ng_grad_0 = parameters_to_vector(ng_grad_0)
            # compute the inverse of the Fisher matrix at x=\theta times $grad with conjugate gradient
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_inv_grad = conjugate_gradient(hessian_vector_product,
                                            ng_grad_0,
                                            cg_iters=cg_iters)

            # compute $ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times $F_inv_grad
            # create graph for higher differential
            # self.baseline.fit(train_episodes)
            loss = self.inner_loss(train_episodes)
            grad = torch.autograd.grad(loss,
                                       self.policy.parameters(),
                                       create_graph=True)
            grad = parameters_to_vector(grad)
            grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
            ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                            self.policy.parameters())
            ng_grad_1 = parameters_to_vector(ng_grad_1)
            # compute $ng_grad_2 = the Jacobian of {F(x) U(\theta)} at x = \theta times $F_inv_grad
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_U = hessian_vector_product(step)
            ng_grad_2 = torch.autograd.grad(
                torch.dot(F_U, F_inv_grad.detach()), self.policy.parameters())
            ng_grad_2 = parameters_to_vector(ng_grad_2)
            ng_grad = ng_grad_0 - step_size * (ng_grad_1 + ng_grad_2)

            ng_grad = parameters_to_vector(ng_grad)
            ng_grads.append(ng_grad.view(len(ng_grad), 1))

        return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
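The conjugate_gradient helper used above is not shown in the example; below is a standard sketch of the routine it presumably wraps, solving F x = g given only damped Fisher-vector products (the actual implementation may add detaching or early-exit details).

import torch


def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Solve A x = b using only matrix-vector products f_Ax(v) = A v,
    # where A is the (damped) Fisher information matrix.
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x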
Example #27
    def surrogate_loss(self, episodes, old_pis=None):
        """
        Surrogate objective:
        E_r SmoothReLU( V_r^{adapted self.policy} - \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi )

        V_r^{adapted self.policy} can be evaluated by valid_episodes in episodes
        \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi is computed in self.values_of_optimized_policies

        :param episodes: [(episodes before adapting, episodes after adapting) for task in sampled tasks]
        :param old_pis: dummy parameter derived from super
        :return: mean of losses, mean of kls, pis
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for episode_index in range(len(episodes)):
            (train_episodes, valid_episodes) = episodes[episode_index]
            old_pi = old_pis[episode_index]

            if self.current_policy_idx == 0:
                dominance_correction = 1
            else:
                difference_from_best_value = total_rewards(
                    valid_episodes.rewards
                ) - self.values_of_optimized_policies[episode_index]
                dominance_correction = 1 - 1 / (
                    1 + math.exp(difference_from_best_value))

            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -dominance_correction * weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        if len(losses) == 0 or len(kls) == 0:
            # signal outside that no losses. avoiding taking mean of empty tensors..
            return (None, None, pis)
        else:
            return (torch.mean(torch.stack(losses, dim=0)),
                    torch.mean(torch.stack(kls, dim=0)), pis)
Example #28
    def inner_loss_ppo_noUpdate(self,
                                episodes,
                                first_order,
                                params=None,
                                ent_coef=0,
                                vf_coef=0,
                                nenvs=1):
        """Compute the inner loss for the one-step gradient update. The inner
        loss is PPO with clipped ratio = new_pi/old_pi.
        Can make cliprange adaptable.
        nenvs = number of workers. nsteps defined in env
        """
        #episodes = [num of steps, num of episodes, obs_space]
        #NEED TO CHANGE ADVANTAGE CALCULATION TO CRITIC.
        losses = []

        self.logger.info("cliprange: " + str(self.cliprange) +
                         "; noptepochs: " + str(self.noptepochs) +
                         ";nminibaches: " + str(self.nminibatches) +
                         ";ppo_lr: " + str(self.ppo_lr))
        # Save the old parameters
        old_policy = copy.deepcopy(self.policy)
        old_params = parameters_to_vector(old_policy.parameters())

        #Need to take mini-batch of sampled examples to do gradient update a few times.
        nepisodes = episodes.observations.shape[1]
        nsteps = episodes.observations.shape[0]
        nbatch = nenvs * nsteps * nepisodes
        nbatch_train = nbatch // self.nminibatches
        mblossvals = []

        # Flatten the episodes to [steps, observations]
        episodes_flat = BatchEpisodes(batch_size=nbatch)
        i = 0
        for ep in range(nepisodes):
            for step in range(nsteps):
                episodes_flat.append([episodes.observations[step][ep].numpy()],
                                     [episodes.actions[step][ep].numpy()],
                                     [episodes.returns[step][ep].numpy()],
                                     (i, ))
                i += 1

        inds = np.arange(nbatch)

        # For the case with linear baseline.
        vf_loss = -1

        for epoch in range(self.noptepochs):

            # Randomize the indexes
            #np.random.shuffle(inds)
            mb_vf_loss = torch.zeros(1)
            grad_norm = []
            # 0 to batch_size with batch_train_size step
            for start in range(0, nbatch, nbatch_train):

                mb_obs, mb_returns, mb_masks, mb_actions = [], [], [], []
                mb_episodes = BatchEpisodes(batch_size=nbatch_train)

                end = start + nbatch_train
                mbinds = inds[start:end]

                for i in range(len(mbinds)):
                    mb_obs.append(
                        episodes_flat.observations[0][mbinds[i]].numpy())
                    mb_returns.append(
                        episodes_flat.returns[0][mbinds[i]].numpy())
                    mb_masks.append(episodes_flat.mask[0][mbinds[i]].numpy())
                    mb_actions.append(
                        episodes_flat.actions[0][mbinds[i]].numpy())
                    mb_episodes.append([mb_obs[i]], [mb_actions[i]],
                                       [mb_returns[i]], (i, ))

                if self.baseline_type == 'linear':
                    values = self.baseline(mb_episodes)
                elif self.baseline_type == 'critic separate':
                    values = self.baseline(mb_episodes.observations)
                    # find value loss sum [(R-V(s))^2]
                    R = torch.FloatTensor(np.array(mb_returns))
                    mb_vf_loss = (((values - R)**2).mean()) + mb_vf_loss

                #values = self.baseline(mb_episodes)

                advantages = mb_episodes.gae(values, tau=self.tau)
                advantages_unnorm = advantages
                advantages = weighted_normalize(advantages.type(torch.float32),
                                                weights=torch.ones(
                                                    1, advantages.shape[1]))

                mb_returns_sum = np.sum(mb_returns)
                self.logger.info("iter: " + "epoch:" + str(epoch) + "; mb:" +
                                 str(start / nbatch_train))
                self.logger.info("mb returns: " + str(mb_returns_sum))

                pi = self.policy(mb_episodes.observations)
                log_probs = pi.log_prob(mb_episodes.actions)

                #reload old policy.
                vector_to_parameters(old_params, old_policy.parameters())
                pi_old = old_policy(mb_episodes.observations)

                log_probs_old = pi_old.log_prob(mb_episodes.actions)

                if log_probs.dim() > 2:
                    log_probs_old = torch.sum(log_probs_old, dim=2)
                    log_probs = torch.sum(log_probs, dim=2)

                ratio = torch.exp(log_probs - log_probs_old)

                self.logger.info("max pi: ")
                self.logger.info(torch.max(pi.mean))

                for x in ratio[0][:10]:
                    if x > 1E5 or x < 1E-5:
                        #pdb.set_trace()
                        self.logger.info("ratio too large or too small.")
                        self.logger.info(ratio[0][:10])

                self.logger.info("policy ratio: ")
                self.logger.info(ratio[0][:10])

                #loss function
                pg_losses = -advantages * ratio
                pg_losses2 = -advantages * torch.clamp(
                    ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)

                # Final PG loss
                pg_loss = weighted_mean(torch.max(pg_losses, pg_losses2),
                                        weights=torch.ones(
                                            1, advantages.shape[1]))

                self.logger.debug("policy mu weights: ")
                self.logger.debug(self.policy.mu.weight)

                sum_adv = torch.sum(advantages_unnorm).numpy()
                self.logger.info("unnormalized advantages: " + str(sum_adv))

                # Total loss
                loss = pg_loss

                self.logger.info("max_action: " + str(np.max(mb_actions)))
                self.logger.info("max_action index: " +
                                 str(np.argmax(mb_actions)))

                # Save the old parameters
                old_params = parameters_to_vector(self.policy.parameters())
                losses.append(loss)

        self.logger.info("inner loss for each mb and epoch: ")
        self.logger.info(mblossvals)
        return torch.mean(torch.stack(losses, dim=0))