def _fvsp(policy_net, states, damping=1e-2, device='cpu'):
    """Build a Fisher-vector-product callable for conjugate-gradient solvers.

    The mean KL between the current policy distribution and a detached copy
    of itself is differentiated once up front (with ``create_graph=True`` and
    ``retain_graph=True``) so the returned closure can repeatedly take
    Hessian-vector products against that saved graph.

    Returns a function mapping a numpy vector (or a matrix whose columns are
    vectors) to ``F v + damping * v``, as a numpy array (squeezed).
    """
    dist = policy_net(states)
    dist_fixed = detach_distribution(dist)
    mean_kl = torch.mean(kl_divergence(dist_fixed, dist))
    kl_grads = torch.autograd.grad(
        mean_kl, policy_net.parameters(), create_graph=True, retain_graph=True)
    flat_kl_grad = torch.cat([g.view(-1) for g in kl_grads])

    def __fvsp(vectors, damping=damping):
        vectors = torch.from_numpy(vectors).to(device)
        # Accept either a single vector or a matrix of column vectors.
        if vectors.dim() > 1:
            columns = [torch.squeeze(vectors[:, j]) for j in range(vectors.shape[1])]
        else:
            columns = [vectors]
        products = []
        for v in columns:
            # Second differentiation of (grad KL · v) yields the HVP;
            # retain_graph so the saved graph survives across columns.
            gvp = (flat_kl_grad * v).sum()
            hv_grads = torch.autograd.grad(
                gvp, policy_net.parameters(), retain_graph=True)
            flat_hv = torch.cat([g.view(-1) for g in hv_grads])
            products.append(flat_hv + v * damping)
        return np.squeeze(torch.stack(products, dim=1).cpu().numpy())

    return __fvsp
def __fvp(vector, damping=damping):
    """Single Fisher-vector product: return ``F @ vector + damping * vector``.

    NOTE(review): closure — relies on ``policy_net``, ``states`` and
    ``damping`` from an enclosing scope not visible in this chunk. Unlike the
    batched variant, the KL gradient graph is rebuilt on every call.
    """
    dist = policy_net(states)
    dist_fixed = detach_distribution(dist)
    mean_kl = torch.mean(kl_divergence(dist_fixed, dist))
    kl_grads = torch.autograd.grad(
        mean_kl, policy_net.parameters(), create_graph=True)
    flat_kl_grad = torch.cat([g.view(-1) for g in kl_grads])
    # Differentiate (grad KL · v) once more to obtain the Hessian-vector product.
    gvp = (flat_kl_grad * vector).sum()
    hv_grads = torch.autograd.grad(gvp, policy_net.parameters())
    flat_hv = torch.cat([g.view(-1) for g in hv_grads])
    return flat_hv + vector * damping
def _compute_kl(policy_net, states):
    """Mean KL divergence between the policy and a detached copy of itself.

    The detached distribution plays the role of the fixed "old" policy, so
    gradients flow only through the live distribution.
    """
    dist = policy_net(states)
    return kl_divergence(detach_distribution(dist), dist).mean()