from copy import deepcopy

import numpy as np

# quad_expectation and MatrixNormalParameters are assumed to be provided
# by the surrounding module.


def parameter_dual(self, beta):
    agcost = self.parameter_augment_cost(beta)

    # # initialize adversarial xdist with a wide parameter dist.
    # param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)
    # q_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

    # initialize adversarial xdist with the policy xdist
    q_xdist = deepcopy(self.xdist)

    # first iteration to establish the initial convergence KL
    param, xvalue, diverge = self.parameter_backward_pass(beta, agcost, q_xdist)
    if diverge != -1:
        return np.nan, np.nan

    p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

    # convergence of inner loop
    xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                    q_xdist.mu, q_xdist.sigma,
                                    self.dm_state, self.nb_steps + 1)

    while np.any(xdist_kl > 1e-3):
        param, xvalue, diverge = self.parameter_backward_pass(beta, agcost, q_xdist)
        if diverge != -1:
            return np.nan, np.nan

        p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

        # check convergence of loop
        xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                        q_xdist.mu, q_xdist.sigma,
                                        self.dm_state, self.nb_steps + 1)

        # interpolate between distributions
        q_xdist.mu, q_xdist.sigma = self.interp_gauss_kl(q_xdist.mu, q_xdist.sigma,
                                                         p_xdist.mu, p_xdist.sigma, 1e-1)

    # dual expectation
    dual = quad_expectation(q_xdist.mu[..., 0], q_xdist.sigma[..., 0],
                            xvalue.V[..., 0], xvalue.v[..., 0],
                            xvalue.v0[..., 0])
    dual += beta * (np.sum(self.parameter_kldiv(param)) - self.param_kl_bound)

    # dual gradient
    grad = np.sum(self.parameter_kldiv(param)) - self.param_kl_bound

    return -1. * np.array([dual]), -1. * np.array([grad])
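# A minimal sketch of how a scalar dual like the one above can be driven,
# assuming an `agent` instance of the surrounding class; scipy, the starting
# point, and the bounds are illustrative choices, not taken from the original
# code. Since parameter_dual returns (-dual, -gradient), maximizing the dual
# over beta reduces to a bounded minimization with analytic gradients.
import numpy as np
import scipy.optimize


def neg_parameter_dual(beta):
    # f and g come back as 1-element arrays; scipy expects a scalar objective.
    # parameter_dual returns (nan, nan) when the backward pass diverges; a
    # robust driver would catch that and retry with a different beta.
    f, g = agent.parameter_dual(float(beta))
    return float(f), np.asarray(g).ravel()


res = scipy.optimize.minimize(neg_parameter_dual, x0=np.array([1e2]),
                              method='L-BFGS-B', jac=True,
                              bounds=[(1e-8, 1e8)])
beta_star = float(res.x)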
def regularized_parameter_dual(self, eta, ctl, last):
    agcost = self.regularized_parameter_augment_cost(eta, last)

    # initialize adversarial xdist with the policy xdist
    q_xdist = deepcopy(self.xdist)

    # first iteration to establish the initial convergence KL
    param, xvalue, diverge = self.parameter_backward_pass(0., agcost, q_xdist, ctl, eta)
    if diverge != -1:
        return np.nan, np.nan

    p_xdist, _, _ = self.cubature_forward_pass(ctl, param)

    # convergence of inner loop
    xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                    q_xdist.mu, q_xdist.sigma,
                                    self.dm_state, self.nb_steps + 1)

    while np.any(xdist_kl > 1e-3):
        param, xvalue, diverge = self.parameter_backward_pass(0., agcost, q_xdist, ctl, eta)
        if diverge != -1:
            return np.nan, np.nan

        p_xdist, _, _ = self.cubature_forward_pass(ctl, param)

        # check convergence of loop
        xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                        q_xdist.mu, q_xdist.sigma,
                                        self.dm_state, self.nb_steps + 1)

        # interpolate between distributions
        q_xdist.mu, q_xdist.sigma = self.interp_gauss_kl(q_xdist.mu, q_xdist.sigma,
                                                         p_xdist.mu, p_xdist.sigma, 1e-1)

    # dual expectation
    dual = quad_expectation(q_xdist.mu[..., 0], q_xdist.sigma[..., 0],
                            xvalue.V[..., 0], xvalue.v[..., 0],
                            xvalue.v0[..., 0])
    dual = np.array([dual]) + eta * (np.sum(self.parameter_regularizer_kldiv(last, param))
                                     - self.param_regularizer_kl_bound)

    # dual gradient
    grad = np.sum(self.parameter_regularizer_kldiv(last, param)) - self.param_regularizer_kl_bound

    return -1. * dual, -1. * grad
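# Unlike parameter_dual above, this variant passes beta = 0 to the backward
# pass and measures the parameter KL against the previously accepted
# parameters `last`; the returned value and gradient are those of the dual in
# the multiplier eta, so it can be driven by the same kind of scalar,
# gradient-based search as sketched after parameter_dual.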
def regularized_parameter_dual(self, beta, kappa):
    agcost = self.parameter_augment_cost(beta)

    q_xdist = deepcopy(self.xdist)
    # p_xdist = deepcopy(self.xdist)

    # initialize adversarial xdist with a wide parameter dist.
    param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)
    p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

    regcost = self.parameter_dual_regularization(p_xdist, q_xdist, kappa)

    # first iteration to establish the initial convergence KL
    param, xvalue, diverge = self.regularized_parameter_backward_pass(beta, agcost,
                                                                      p_xdist, regcost)
    if diverge != -1:
        return np.nan, np.nan

    # convergence of inner loop
    xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                    q_xdist.mu, q_xdist.sigma,
                                    self.dm_state, self.nb_steps + 1)

    while np.any(xdist_kl > 1e-3):
        regcost = self.parameter_dual_regularization(p_xdist, q_xdist, kappa)
        param, xvalue, diverge = self.regularized_parameter_backward_pass(beta, agcost,
                                                                          p_xdist, regcost)
        if diverge != -1:
            return np.nan, np.nan

        q_xdist = deepcopy(p_xdist)
        p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

        # check convergence of loop
        xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                        q_xdist.mu, q_xdist.sigma,
                                        self.dm_state, self.nb_steps + 1)

    # dual expectation
    dual = quad_expectation(q_xdist.mu[..., 0], q_xdist.sigma[..., 0],
                            xvalue.V[..., 0], xvalue.v[..., 0],
                            xvalue.v0[..., 0])
    dual += beta * (np.sum(self.parameter_kldiv(param)) - self.param_kl_bound)
    dual -= kappa * np.sum(xdist_kl)

    # dual gradient
    grad = np.sum(self.parameter_kldiv(param)) - self.param_kl_bound

    return -1. * np.array([dual]), -1. * np.array([grad])
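# A reading of the code above, not a formula from the original source: with
# param* the minimizer returned by the regularized backward pass for fixed
# beta, and "nominal" standing for whatever reference distribution
# parameter_kldiv compares against, the quantity assembled above is (up to
# the overall sign flip)
#
#   g(beta, kappa) = E_q[ V(x_0) ]
#                    + beta  * (sum_t KL(param*_t || nominal_t) - param_kl_bound)
#                    - kappa * sum_t KL(p_xdist_t || q_xdist_t)
#
# The gradient follows the usual dual-ascent convention of differentiating
# only the explicit beta term while holding the inner minimizer fixed:
#
#   dg/dbeta = sum_t KL(param*_t || nominal_t) - param_kl_bound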
def policy_dual(self, alpha):
    # augmented cost
    agcost = self.policy_augment_cost(alpha)

    # backward pass
    lgc, xvalue, xuvalue, diverge = self.policy_backward_pass(alpha, agcost)

    # forward pass
    xdist, udist, xudist = self.cubature_forward_pass(lgc, self.param)

    # dual expectation
    dual = quad_expectation(xdist.mu[..., 0], xdist.sigma[..., 0],
                            xvalue.V[..., 0], xvalue.v[..., 0],
                            xvalue.v0[..., 0])

    if self.kl_stepwise:
        # one multiplier and one KL bound per time step
        dual = np.array([dual]) - np.sum(alpha * self.policy_kl_bound)
        grad = self.policy_kldiv(lgc, xdist) - self.policy_kl_bound
    else:
        # a single multiplier and a total KL bound over the horizon
        dual = np.array([dual]) - alpha * self.policy_kl_bound
        grad = np.sum(self.policy_kldiv(lgc, xdist)) - self.policy_kl_bound

    return -1. * dual, -1. * grad
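# A minimal sketch of driving policy_dual, assuming an `agent` instance of
# the surrounding class; the initializer and bounds are illustrative. When
# kl_stepwise is set, alpha and the returned gradient are per-time-step
# vectors, so the search runs over one multiplier per step (assumed here to
# be agent.nb_steps of them; adjust if the bound vector has another length).
import numpy as np
import scipy.optimize


def neg_policy_dual(alpha):
    f, g = agent.policy_dual(alpha)
    return float(f), np.asarray(g).ravel()


dim = agent.nb_steps if agent.kl_stepwise else 1
res = scipy.optimize.minimize(neg_policy_dual, x0=1e2 * np.ones((dim,)),
                              method='L-BFGS-B', jac=True,
                              bounds=[(1e-8, 1e8)] * dim)
alpha_star = res.x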