def gaussian_frobenius(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor], q: Tuple[ch.Tensor, ch.Tensor],
                       scale_prec: bool = False, return_cov: bool = False) \
        -> Union[Tuple[ch.Tensor, ch.Tensor], Tuple[ch.Tensor, ch.Tensor, ch.Tensor, ch.Tensor]]:
    """
    Compute (mu_p - mu_q) (L_o L_o^T)^-1 (mu_p - mu_q)^T + |L L^T - L_o L_o^T|_F^2
    for p ~ N(mu_p, L L^T) and q ~ N(mu_q, L_o L_o^T).

    Args:
        policy: current policy
        p: mean and chol of gaussian p
        q: mean and chol of gaussian q
        return_cov: return cov matrices for further computations
        scale_prec: scale objective with precision matrix

    Returns:
        mahalanobis distance, squared frobenius norm
    """
    mean, chol = p
    mean_other, chol_other = q

    mean_part = mean_distance(policy, mean, mean_other, chol_other, scale_prec)

    # frob objective for cov
    cov_other = policy.covariance(chol_other)
    cov = policy.covariance(chol)
    diff = cov_other - cov
    # Matrix is real symmetric, therefore |A|^2_F = tr{A^T @ A} = tr{A @ A}
    cov_part = torch_batched_trace(diff @ diff)

    if return_cov:
        return mean_part, cov_part, cov, cov_other

    return mean_part, cov_part

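# `torch_batched_trace` is used throughout this module but not defined in this excerpt.
# A minimal sketch consistent with its usage here (trace over the last two dimensions of
# a batch of square matrices); the actual library helper may differ:
def torch_batched_trace(x: ch.Tensor) -> ch.Tensor:
    """Sum the diagonal of the last two dimensions, keeping all leading batch dimensions."""
    return ch.diagonal(x, dim1=-2, dim2=-1).sum(-1)
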
def gaussian_kl(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                q: Tuple[ch.Tensor, ch.Tensor]) -> Tuple[ch.Tensor, ch.Tensor]:
    """
    Get the expected KL divergence between two sets of Gaussians over states -
    calculates E KL(p||q): E[sum p(x) log(p(x)/q(x))] in closed form for Gaussians.

    Args:
        policy: policy instance
        p: first distribution tuple (mean, chol/std)
        q: second distribution tuple (mean, chol/std)

    Returns:
        mean part and covariance part of the KL divergence
    """
    mean, std = p
    mean_other, std_other = q
    k = mean.shape[-1]

    det_term = policy.log_determinant(std)
    det_term_other = policy.log_determinant(std_other)

    cov = policy.covariance(std)
    prec_other = policy.precision(std_other)

    maha_part = .5 * policy.maha(mean, mean_other, std_other)
    # trace_part = (var * precision_other).sum([-1, -2])
    trace_part = torch_batched_trace(prec_other @ cov)
    cov_part = .5 * (trace_part - k + det_term_other - det_term)

    return maha_part, cov_part

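# Illustration only (not part of the library): for diagonal Gaussians, the sum of the mean
# part and covariance part above should reproduce the analytic KL divergence. A small,
# self-contained check against torch.distributions under that assumption:
import torch as ch  # the surrounding module aliases torch as ch


def _diag_gauss_kl_parts(mean, std, mean_other, std_other):
    # std holds per-dimension standard deviations here, shape (batch, k)
    k = mean.shape[-1]
    maha_part = 0.5 * (((mean - mean_other) / std_other) ** 2).sum(-1)
    trace_part = (std ** 2 / std_other ** 2).sum(-1)
    det_part = 2 * (std_other.log().sum(-1) - std.log().sum(-1))
    return maha_part, 0.5 * (trace_part - k + det_part)


def _check_diag_gauss_kl(batch: int = 4, k: int = 3) -> None:
    mean, std = ch.randn(batch, k), ch.rand(batch, k) + 0.1
    mean_other, std_other = ch.randn(batch, k), ch.rand(batch, k) + 0.1
    m_part, c_part = _diag_gauss_kl_parts(mean, std, mean_other, std_other)
    p_dist = ch.distributions.Independent(ch.distributions.Normal(mean, std), 1)
    q_dist = ch.distributions.Independent(ch.distributions.Normal(mean_other, std_other), 1)
    assert ch.allclose(m_part + c_part, ch.distributions.kl_divergence(p_dist, q_dist), atol=1e-5)
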
def gaussian_wasserstein_non_commutative(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                         q: Tuple[ch.Tensor, ch.Tensor], scale_prec=False, return_eig=False) -> \
        Union[Tuple[ch.Tensor, ch.Tensor], Tuple[ch.Tensor, ch.Tensor, ch.Tensor, ch.Tensor]]:
    """
    Compute mean part and cov part of W_2(p || q) with p,q ~ N(y, SS).
    This version DOES NOT assume commutativity of both distributions, i.e. of their covariance matrices.
    This is more general and does not make any assumptions.
    When scale_prec is true, scale both distributions with the old precision matrix.

    Args:
        policy: current policy
        p: mean and sqrt of gaussian p
        q: mean and sqrt of gaussian q
        scale_prec: scale objective by old precision matrix.
                    This penalizes directions based on old uncertainty/covariance.
        return_eig: return eigendecomposition for further computation

    Returns:
        mean part of W2, cov part of W2
    """
    mean, sqrt = p
    mean_other, sqrt_other = q
    batch_dim, dim = mean.shape

    mean_part = mean_distance(policy, mean, mean_other, sqrt_other, scale_prec)

    cov = policy.covariance(sqrt)

    if scale_prec:
        # cov constraint scaled with precision of old dist
        # W2 objective for cov assuming normal W2 objective for mean
        identity = ch.eye(dim, dtype=sqrt.dtype, device=sqrt.device)
        sqrt_inv_other = ch.solve(identity, sqrt_other)[0]
        c = sqrt_inv_other @ cov @ sqrt_inv_other

        # compute inner parenthesis of trace in W2,
        # only consider lower triangular parts, given cov/sqrt(cov) is symmetric PSD.
        eigvals, eigvecs = ch.symeig(c, eigenvectors=return_eig, upper=False)
        # make use of the following property to compute the trace of the root:
        # A^2 x = A(A x) = A lambda x = lambda (A x) = lambda^2 x
        cov_part = torch_batched_trace(identity + c) - 2 * eigvals.sqrt().sum(1)

    else:
        # W2 objective for cov assuming normal W2 objective for mean
        cov_other = policy.covariance(sqrt_other)

        # compute inner parenthesis of trace in W2,
        # only consider lower triangular parts, given cov/sqrt(cov) is symmetric PSD.
        eigvals, eigvecs = ch.symeig(cov @ cov_other, eigenvectors=return_eig, upper=False)
        # make use of the following property to compute the trace of the root:
        # A^2 x = A(A x) = A lambda x = lambda (A x) = lambda^2 x
        cov_part = torch_batched_trace(cov_other + cov) - 2 * eigvals.sqrt().sum(1)

    if return_eig:
        return mean_part, cov_part, eigvals, eigvecs

    return mean_part, cov_part

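# Illustration only: the W2 covariance term needs tr((Sigma_q^{1/2} Sigma_p Sigma_q^{1/2})^{1/2}),
# which equals the sum of the square roots of the eigenvalues of Sigma_p @ Sigma_q, because the two
# matrices are similar and share their spectrum. A small numerical check of that identity,
# assuming scipy is available as an independent reference for the matrix square root:
import numpy as np
from scipy.linalg import sqrtm


def _check_w2_trace_identity(dim: int = 4, seed: int = 0) -> None:
    rng = np.random.default_rng(seed)
    a, b = rng.standard_normal((dim, dim)), rng.standard_normal((dim, dim))
    cov_p, cov_q = a @ a.T + np.eye(dim), b @ b.T + np.eye(dim)
    # sum of sqrt of eigenvalues of the (generally non-symmetric) product
    via_eigvals = np.sqrt(np.linalg.eigvals(cov_p @ cov_q).real).sum()
    # trace of the symmetric matrix square root form
    via_sqrtm = np.trace(sqrtm(sqrtm(cov_q) @ cov_p @ sqrtm(cov_q))).real
    assert np.isclose(via_eigvals, via_sqrtm)
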
def constraint_values(proj_type, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                      q: Tuple[ch.Tensor, ch.Tensor], scale_prec: bool = True):
    """
    Computes the relevant metrics for a given batch of predictions.

    Args:
        proj_type: type of projection to compute the metrics for
        policy: current policy
        p: mean and std of gaussian p
        q: mean and std of gaussian q
        scale_prec: for W2 projection, use version scaled with precision matrix

    Returns:
        entropy, mean_part, cov_part, kl
    """
    if proj_type == "w2":
        mean_part, cov_part = gaussian_wasserstein_commutative(policy, p, q, scale_prec=scale_prec)
    elif proj_type == "w2_non_com":
        # For this case only the sum is relevant, no individual projections for mean and std make sense
        mean_part, cov_part = gaussian_wasserstein_non_commutative(policy, p, q, scale_prec=scale_prec)
    elif proj_type == "frob":
        mean_part, cov_part = gaussian_frobenius(policy, p, q, scale_prec=scale_prec)
    else:
        # we assume kl projection as default (this is also true for PPO)
        mean_part, cov_part = gaussian_kl(policy, p, q)

    entropy = policy.entropy(p)
    mean_kl, cov_kl = gaussian_kl(policy, p, q)
    kl = mean_kl + cov_kl

    return entropy, mean_part, cov_part, kl

def entropy_inequality_projection(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                  beta: Union[float, ch.Tensor]):
    """
    Projects std to satisfy an entropy INEQUALITY constraint.

    Args:
        policy: policy instance
        p: current distribution
        beta: target entropy for EACH std or general bound for all stds

    Returns:
        projected std that satisfies the entropy bound
    """
    mean, std = p
    k = std.shape[-1]
    batch_shape = std.shape[:-2]

    ent = policy.entropy(p)
    mask = ent < beta

    # if nothing has to be projected skip computation
    if (~mask).all():
        return p

    alpha = ch.ones(batch_shape, dtype=std.dtype, device=std.device)
    alpha[mask] = ch.exp((beta[mask] - ent[mask]) / k)

    proj_std = ch.einsum('ijk,i->ijk', std, alpha)
    return mean, ch.where(mask[..., None, None], proj_std, std)

def evaluate_policy(self, policy: AbstractGaussianPolicy, render: bool = False, deterministic: bool = True):
    """
    Evaluate a given policy.

    Args:
        policy: policy to evaluate
        render: render policy behavior
        deterministic: whether to choose deterministic actions

    Returns:
        Dict with performance metrics.
    """
    if self.n_test_envs == 0:
        return

    n_runs = 1
    ep_rewards = np.zeros((n_runs, self.n_test_envs))
    ep_lengths = np.zeros((n_runs, self.n_test_envs))

    for i in range(n_runs):
        not_dones = np.ones((self.n_test_envs,), bool)
        obs = self.envs.reset_test()

        while np.any(not_dones):
            ep_lengths[i, not_dones] += 1
            if render:
                self.envs.render_test(mode="human")
            with ch.no_grad():
                p = policy(tensorize(obs, self.cpu, self.dtype))
                actions = p[0] if deterministic else policy.sample(p)
                actions = policy.squash(actions)
            obs, rews, dones, infos = self.envs.step_test(get_numpy(actions))
            ep_rewards[i, not_dones] += rews[not_dones]
            # only set to False when env has never terminated before, otherwise we favor earlier terminating envs.
            not_dones = np.logical_and(~dones, not_dones)

    return self.get_reward_dict(ep_rewards, ep_lengths)

def _trust_region_projection(self, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                             q: Tuple[ch.Tensor, ch.Tensor], eps: ch.Tensor, eps_cov: ch.Tensor, **kwargs):
    """
    Runs KL projection layer and constructs cholesky of covariance.

    Args:
        policy: policy instance
        p: current distribution
        q: old distribution
        eps: (modified) kl bound/kl bound for mean part
        eps_cov: (modified) kl bound for cov part
        **kwargs:

    Returns:
        projected mean, projected cov cholesky
    """
    mean, std = p
    old_mean, old_std = q

    if not policy.contextual_std:
        # only project first one to reduce number of numerical optimizations
        std = std[:1]
        old_std = old_std[:1]

    ################################################################################################################
    # project mean with closed form
    mean_part, _ = gaussian_kl(policy, p, q)
    proj_mean = mean_projection(mean, old_mean, mean_part, eps)

    cov = policy.covariance(std)
    old_cov = policy.covariance(old_std)

    if policy.is_diag:
        proj_cov = KLProjectionGradFunctionDiagCovOnly.apply(cov.diagonal(dim1=-2, dim2=-1),
                                                             old_cov.diagonal(dim1=-2, dim2=-1),
                                                             eps_cov)
        proj_std = proj_cov.sqrt().diag_embed()
    else:
        raise NotImplementedError("The KL projection currently does not support full covariance matrices.")

    if not policy.contextual_std:
        # expand first std back to batch size
        proj_std = proj_std.expand(mean.shape[0], -1, -1)

    return proj_mean, proj_std

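# `mean_projection` is referenced above but not defined in this excerpt. A plausible sketch of the
# closed-form mean projection, assuming the mean is interpolated towards the old mean whenever its
# Mahalanobis term exceeds the bound eps, so the projected mean lies exactly on the trust-region
# boundary; the actual library routine may differ in details (e.g. numerical guards):
def mean_projection(mean: ch.Tensor, old_mean: ch.Tensor, maha: ch.Tensor, eps: ch.Tensor) -> ch.Tensor:
    mask = maha > eps
    if mask.any():
        # omega chosen such that maha(projected mean, old mean) == eps
        omega = ch.ones(maha.shape, dtype=mean.dtype, device=mean.device)
        omega[mask] = ch.sqrt(maha[mask] / eps) - 1.
        proj = (mean + omega[..., None] * old_mean) / (1. + omega[..., None])
        mean = ch.where(mask[..., None], proj, mean)
    return mean
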
def gaussian_wasserstein_commutative(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                     q: Tuple[ch.Tensor, ch.Tensor], scale_prec=False) -> Tuple[ch.Tensor, ch.Tensor]:
    """
    Compute mean part and cov part of W_2(p || q) with p,q ~ N(y, SS).
    This version DOES assume commutativity of both distributions, i.e. of their covariance matrices.
    This is less general and assumes both distributions are somewhat close together.
    When scale_prec is true, scale both distributions with the old precision matrix.

    Args:
        policy: current policy
        p: mean and sqrt of gaussian p
        q: mean and sqrt of gaussian q
        scale_prec: scale objective by old precision matrix.
                    This penalizes directions based on old uncertainty/covariance.

    Returns:
        mean part of W2, cov part of W2
    """
    mean, sqrt = p
    mean_other, sqrt_other = q

    mean_part = mean_distance(policy, mean, mean_other, sqrt_other, scale_prec)

    cov = policy.covariance(sqrt)

    if scale_prec:
        # cov constraint scaled with precision of old dist
        batch_dim, dim = mean.shape
        identity = ch.eye(dim, dtype=sqrt.dtype, device=sqrt.device)
        sqrt_inv_other = ch.solve(identity, sqrt_other)[0]
        c = sqrt_inv_other @ cov @ sqrt_inv_other

        cov_part = torch_batched_trace(identity + c - 2 * sqrt_inv_other @ sqrt)
    else:
        # W2 objective for cov assuming normal W2 objective for mean
        cov_other = policy.covariance(sqrt_other)
        cov_part = torch_batched_trace(cov_other + cov - 2 * sqrt_other @ sqrt)

    return mean_part, cov_part

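# `mean_distance` is used by the Frobenius and Wasserstein objectives above but not defined in this
# excerpt. Judging from its call signature, a sketch could be: the Mahalanobis distance w.r.t. the
# old covariance when scale_prec is set, and the squared Euclidean distance otherwise. The actual
# helper may differ, e.g. in scaling factors:
def mean_distance(policy: AbstractGaussianPolicy, mean: ch.Tensor, mean_other: ch.Tensor,
                  std_other: ch.Tensor, scale_prec: bool = False) -> ch.Tensor:
    if scale_prec:
        # mean part scaled with the old precision matrix
        return policy.maha(mean, mean_other, std_other)
    # plain squared euclidean distance for the mean
    return ((mean_other - mean) ** 2).sum(-1)
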
def entropy_equality_projection(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                beta: Union[float, ch.Tensor]):
    """
    Projects std to satisfy an entropy EQUALITY constraint.

    Args:
        policy: policy instance
        p: current distribution
        beta: target entropy for EACH std or general bound for all stds

    Returns:
        projected std that satisfies the entropy bound
    """
    mean, std = p
    k = std.shape[-1]

    ent = policy.entropy(p)
    alpha = ch.exp((beta - ent) / k)

    proj_std = ch.einsum('ijk,i->ijk', std, alpha)
    return mean, proj_std

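# Why the scaling factor works (illustration only, not part of the library): rescaling the Cholesky
# factor by alpha = exp((beta - H) / k) shifts a k-dimensional Gaussian's entropy by k * log(alpha)
# = beta - H, i.e. exactly onto the target entropy beta. A self-contained check with torch.distributions:
import torch as ch


def _check_entropy_scaling(k: int = 3, beta: float = 2.5, seed: int = 0) -> None:
    ch.manual_seed(seed)
    a = ch.randn(k, k)
    chol = ch.linalg.cholesky(a @ a.T + ch.eye(k))
    dist = ch.distributions.MultivariateNormal(ch.zeros(k), scale_tril=chol)
    alpha = ch.exp((beta - dist.entropy()) / k)
    proj = ch.distributions.MultivariateNormal(ch.zeros(k), scale_tril=alpha * chol)
    assert ch.allclose(proj.entropy(), ch.tensor(beta), atol=1e-5)
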
def run(self, rollout_steps, policy: AbstractGaussianPolicy, vf_model: Union[VFNet, None] = None,
        reset_envs: bool = False) -> TrajectoryOnPolicyRaw:
    """
    Generate trajectories of the environment.

    Args:
        rollout_steps: Number of steps to generate
        policy: Policy model to generate samples for
        vf_model: vf model to generate value estimate for all states.
        reset_envs: Whether to reset all envs in the beginning.

    Returns:
        Trajectory with the respective data as torch tensors.
    """
    # Here, we init the buffers that will contain the minibatch of experiences
    num_envs = self.n_envs

    base_shape = (rollout_steps, num_envs)
    base_shape_p1 = (rollout_steps + 1, num_envs)
    base_action_shape = base_shape + self.envs.action_space.shape

    mb_obs = ch.zeros(base_shape_p1 + self.envs.observation_space.shape, dtype=self.dtype)
    mb_actions = ch.zeros(base_action_shape, dtype=self.dtype)
    mb_rewards = ch.zeros(base_shape, dtype=self.dtype)
    mb_dones = ch.zeros(base_shape, dtype=ch.bool)

    ep_infos = []
    mb_time_limit_dones = ch.zeros(base_shape, dtype=ch.bool)

    mb_means = ch.zeros(base_action_shape, dtype=self.dtype)
    mb_stds = ch.zeros(base_action_shape + self.envs.action_space.shape, dtype=self.dtype)

    # continue from last state
    # Before first step we already have self.obs because env calls self.obs = env.reset() on init
    obs = self.envs.reset() if reset_envs else self.envs.last_obs
    obs = tensorize(obs, self.cpu, self.dtype)

    # For n in range number of steps
    for i in range(rollout_steps):
        # Given observations, get actions from the policy
        pds = policy(obs, train=False)
        actions = policy.sample(pds)
        squashed_actions = policy.squash(actions)

        mb_obs[i] = obs
        mb_actions[i] = squashed_actions

        obs, rewards, dones, infos = self.envs.step(squashed_actions.cpu().numpy())
        obs = tensorize(obs, self.cpu, self.dtype)

        mb_means[i] = pds[0]
        mb_stds[i] = pds[1]

        mb_time_limit_dones[i] = tensorize(infos["horizon"], self.cpu, ch.bool)

        if infos.get("done"):
            ep_infos.extend(infos.get("done"))

        mb_rewards[i] = tensorize(rewards, self.cpu, self.dtype)
        mb_dones[i] = tensorize(dones, self.cpu, ch.bool)

    # need value prediction for last obs in rollout to estimate loss
    mb_obs[-1] = obs

    # compute all logpacs and value estimates at once --> less computation
    mb_logpacs = policy.log_probability((mb_means, mb_stds), mb_actions)
    mb_values = (vf_model if vf_model else policy.get_value)(mb_obs, train=False)

    out = (mb_obs[:-1], mb_actions, mb_logpacs, mb_rewards, mb_values, mb_dones, mb_time_limit_dones,
           mb_means, mb_stds)

    if not self.cpu:
        out = tuple(map(to_gpu, out))

    if ep_infos:
        ep_infos = np.array(ep_infos)
        ep_length, ep_reward = ep_infos[:, 0], ep_infos[:, 1]
        self.total_rewards.extend(ep_reward)
        self.total_steps.extend(ep_length)

    return TrajectoryOnPolicyRaw(*out)

def _trust_region_projection(self, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                             q: Tuple[ch.Tensor, ch.Tensor], eps: Union[ch.Tensor, float],
                             eps_cov: Union[ch.Tensor, float], **kwargs):
    """
    Runs PAPI projection layer and constructs sqrt of covariance.

    Args:
        policy: policy instance
        p: current distribution
        q: old distribution
        eps: (modified) kl bound/kl bound for mean part
        eps_cov: (modified) kl bound for cov part
        **kwargs:

    Returns:
        mean interpolation factor eta, projected cov cholesky
    """
    mean, chol = p
    old_mean, old_chol = q
    intermed_mean = kwargs.get('intermed_mean')

    dtype = mean.dtype
    device = mean.device

    dim = mean.shape[-1]

    ################################################################################################################
    # Precompute basic matrices

    # Joint bound
    eps += eps_cov

    I = ch.eye(dim, dtype=dtype, device=device)
    old_precision = ch.cholesky_solve(I, old_chol)[0]
    logdet_old = policy.log_determinant(old_chol)
    cov = policy.covariance(chol)

    ################################################################################################################
    # compute expected KL
    maha_part, cov_part = gaussian_kl(policy, p, q)
    maha_part = maha_part.mean()
    cov_part = cov_part.mean()

    if intermed_mean is not None:
        maha_intermediate = 0.5 * policy.maha(intermed_mean, old_mean, old_chol).mean()
        mm = ch.min(maha_part, maha_intermediate)

    ################################################################################################################
    # matrix rotation/rescaling projection
    if maha_part + cov_part > eps + 1e-6:
        old_cov = policy.covariance(old_chol)

        maha_delta = eps if intermed_mean is None else (eps - mm)
        eta_rot = maha_delta / ch.max(maha_part + cov_part, ch.tensor(1e-16, dtype=dtype, device=device))
        new_cov = (1 - eta_rot) * old_cov + eta_rot * cov
        proj_chol = ch.cholesky(new_cov)

        # recompute covariance part of KL for new chol
        trace_term = 0.5 * (torch_batched_trace(old_precision @ new_cov) - dim).mean()  # rotation difference
        entropy_diff = 0.5 * (logdet_old - policy.log_determinant(proj_chol)).mean()

        cov_part = trace_term + entropy_diff

    else:
        proj_chol = chol

    ################################################################################################################
    # mean interpolation projection
    if maha_part + cov_part > eps + 1e-6:

        if intermed_mean is not None:
            a = 0.5 * policy.maha(mean, intermed_mean, old_chol).mean()
            b = 0.5 * ((mean - intermed_mean) @ old_precision @ (intermed_mean - old_mean).T).mean()
            c = maha_intermediate - ch.max(eps - cov_part, ch.tensor(0., dtype=dtype, device=device))
            eta_mean = (-b + ch.sqrt(ch.max(b * b - a * c, ch.tensor(1e-16, dtype=dtype, device=device)))) / \
                       ch.max(a, ch.tensor(1e-16, dtype=dtype, device=device))
        else:
            eta_mean = ch.sqrt(ch.max(eps - cov_part, ch.tensor(1e-16, dtype=dtype, device=device)) /
                               ch.max(maha_part, ch.tensor(1e-16, dtype=dtype, device=device)))
    else:
        eta_mean = ch.tensor(1., dtype=dtype, device=device)

    return eta_mean, proj_chol

def _papi_steps(self, policy: AbstractGaussianPolicy, q: Tuple[ch.Tensor, ch.Tensor], obs: ch.Tensor,
                lr_schedule, lr_schedule_vf=None):
    """
    Take PAPI steps after PPO finished its steps. Policy parameters are updated in-place.

    Args:
        policy: policy instance
        q: old distribution
        obs: collected observations from trajectories
        lr_schedule: lr schedule for policy
        lr_schedule_vf: lr schedule for vf

    Returns:

    """
    assert not policy.contextual_std

    # save latest policy in history
    self.last_policies.append(copy.deepcopy(policy))

    ################################################################################################################
    # policy backtracking: out of the last n policies and the current one, find one that satisfies the kl constraint

    intermed_policy = None
    n_backtracks = 0

    for i, pi in enumerate(reversed(self.last_policies)):
        p_prime = pi(obs)
        mean_part, cov_part = pi.kl_divergence(p_prime, q)
        if (mean_part + cov_part).mean() <= self.mean_bound + self.cov_bound:
            intermed_policy = pi
            n_backtracks = i
            break

    ################################################################################################################
    # LR update

    # reduce learning rate when appropriate policy not within the last 4 epochs
    if n_backtracks >= 4 or intermed_policy is None:
        # Linear learning rate annealing
        lr_schedule.step()
        if lr_schedule_vf:
            lr_schedule_vf.step()

    if intermed_policy is None:
        # pop last policy and make it the current one, as the updated one was poor
        # do not keep last policy in history, otherwise we could stack the same policy multiple times.
        if len(self.last_policies) >= 1:
            policy.load_state_dict(self.last_policies.pop().state_dict())
        logger.warning(f"No suitable policy found in backtracking of {len(self.last_policies)} policies.")
        return

    ################################################################################################################
    # PAPI iterations

    # We assume only non contextual covariances here, therefore we only need to project for one
    q = (q[0], q[1][:1])  # (means, covs[:1])

    # This is A from Alg. 2 [Akrour et al., 2019]
    intermed_weight = intermed_policy.get_last_layer().detach().clone()
    # This is A @ phi(s)
    intermed_mean = p_prime[0].detach().clone()

    entropy = policy.entropy(q)
    entropy_bound = obs.new_tensor([-np.inf]) if entropy / self.initial_entropy > 0.5 \
        else entropy - (self.mean_bound + self.cov_bound)

    for _ in range(20):
        eta, proj_chol = self._projection(intermed_policy, (p_prime[0], p_prime[1][:1]), q,
                                          self.mean_bound, self.cov_bound, entropy_bound,
                                          intermed_mean=intermed_mean)
        intermed_policy.papi_weight_update(eta, intermed_weight)
        intermed_policy.set_std(proj_chol[0])
        p_prime = intermed_policy(obs)

    policy.load_state_dict(intermed_policy.state_dict())

def trust_region_regression(self, policy: AbstractGaussianPolicy, obs: ch.Tensor, q: Tuple[ch.Tensor, ch.Tensor],
                            n_minibatches: int, global_steps: int):
    """
    Take additional regression steps to match projection output and policy output.
    The policy parameters are updated in-place.

    Args:
        policy: policy instance
        obs: collected observations from trajectories
        q: old distributions
        n_minibatches: split the rollouts into n_minibatches.
        global_steps: current number of steps, required for projection

    Returns:
        dict with mean of regression loss
    """
    if not self.do_regression:
        return {}

    policy_unprojected = copy.deepcopy(policy)
    optim_reg = get_optimizer(self.optimizer_type_reg, policy_unprojected.parameters(), learning_rate=self.lr_reg)
    optim_reg.reset()

    reg_losses = obs.new_tensor(0.)

    # get current projected values --> targets for regression
    p_flat = policy(obs)
    p_target = self(policy, p_flat, q, global_steps)

    for _ in range(self.regression_iters):
        batch_indices = generate_minibatches(obs.shape[0], n_minibatches)

        # Minibatch SGD
        for indices in batch_indices:
            batch = select_batch(indices, obs, p_target[0], p_target[1])
            b_obs, b_target_mean, b_target_std = batch
            proj_p = (b_target_mean.detach(), b_target_std.detach())

            p = policy_unprojected(b_obs)

            # invert scaling with coeff here as we do not have to balance with other losses
            loss = self.get_trust_region_loss(policy, p, proj_p) / self.trust_region_coeff

            optim_reg.zero_grad()
            loss.backward()
            optim_reg.step()
            reg_losses += loss.detach()

    policy.load_state_dict(policy_unprojected.state_dict())

    if not policy.contextual_std:
        # set policy with projection value.
        # In non-contextual cases we have only one cov, so the projection is the same.
        policy.set_std(p_target[1][0])

    steps = self.regression_iters * (math.ceil(obs.shape[0] / n_minibatches))
    return {"regression_loss": (reg_losses / steps).detach()}

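# `generate_minibatches` and `select_batch` are referenced above but not defined in this excerpt.
# Minimal sketches consistent with how they are called (shuffle all sample indices, split them into
# n_minibatches chunks, then index every passed tensor with a chunk); the actual utilities may differ.
import numpy as np


def generate_minibatches(n: int, n_minibatches: int):
    state_indices = np.arange(n)
    np.random.shuffle(state_indices)
    return np.array_split(state_indices, n_minibatches)


def select_batch(indices, *args):
    return [arg[indices] for arg in args]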