def __init__(self, env, nb_steps, init_state,
             init_action_sigma=1.,
             policy_kl_bound=0.1,
             param_kl_bound=100,
             kl_stepwise=False,
             activation=None,
             slew_rate=False,
             action_penalty=None):

    # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_noise = self.env.unwrapped.sigma
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    self.kl_stepwise = kl_stepwise
    if self.kl_stepwise:
        self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
        self.alpha = 1e8 * np.ones((self.nb_steps, ))
    else:
        self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
        self.alpha = 1e8 * np.ones((1, ))

    self.param_kl_bound = param_kl_bound
    self.beta = np.array([1e16])

    # create state distribution and initialize first time step
    self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

    self.udist = Gaussian(self.dm_act, self.nb_steps)
    self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # nominal LQG dynamics (hard-coded linearization)
    # _A = np.array([[1.11627793, 0.00000000],
    #                [0.11107100, 1.10517083]])
    # _B = np.array([[0.10570721],
    #                [0.00536379]])
    # _c = np.array([-1.16277934, -2.16241837])

    _A = np.array([[9.99999500e-01, 9.99000500e-03],
                   [-9.99000500e-05, 9.98001499e-01]])
    _B = np.array([[4.99666792e-05],
                   [9.99000500e-03]])
    _c = np.array([0., 0.])

    _params = np.hstack((_A, _B, _c[:, None]))
    for t in range(self.nb_steps):
        # vectorize [A, B, c] column-wise into the matrix-normal mean
        self.nominal.mu[..., t] = np.reshape(_params, self.dm_param, order='F')
        self.nominal.sigma[..., t] = 1e-8 * np.eye(self.dm_param)

    self.param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # we assume the process noise of the dynamics is known
    self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
    for t in range(self.nb_steps):
        self.noise[..., t] = self.env_noise

    self.ctl = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps, init_action_sigma)

    # activation weighting of the cost function: uniform, sigmoid, or discounted
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1)

    self.last_return = -np.inf

    self.data = {}
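# --- Illustrative sketch (added; not part of the original class) ---
# The constructor above flattens the nominal dynamics [A, B, c] column-wise
# (order='F') into the matrix-normal mean. The helpers below, with
# hypothetical names, show that convention and how to invert it.
import numpy as np

def vectorize_dynamics(A, B, c):
    # stack [A, B, c] and flatten column-by-column, as in the constructor
    params = np.hstack((A, B, c[:, None]))
    return np.reshape(params, -1, order='F')

def devectorize_dynamics(mu, dm_state, dm_act):
    # undo the column-major flattening to recover A, B, c
    params = np.reshape(mu, (dm_state, dm_state + dm_act + 1), order='F')
    return (params[:, :dm_state],
            params[:, dm_state:dm_state + dm_act],
            params[:, -1])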
def __init__(self, env, nb_steps, init_state,
             init_action_sigma=1.,
             policy_kl_bound=0.1,
             param_kl_bound=100,
             kl_stepwise=False,
             activation=None,
             slew_rate=False,
             action_penalty=None,
             prior=None):

    # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    self.kl_stepwise = kl_stepwise
    if self.kl_stepwise:
        self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
        self.alpha = 1e8 * np.ones((self.nb_steps, ))
    else:
        self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
        self.alpha = 1e8 * np.ones((1, ))

    self.param_kl_bound = param_kl_bound
    self.beta = np.array([1e8])

    # create state distribution and initialize first time step
    self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

    self.udist = Gaussian(self.dm_act, self.nb_steps)
    self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    # nominal dynamics are learned from data, given an optional prior
    self.nominal = LearnedProbabilisticLinearDynamics(self.dm_state, self.dm_act,
                                                      self.nb_steps, prior)
    self.param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # process noise placeholder, initialized to zero
    self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))

    self.ctl = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps, init_action_sigma)

    # activation weighting of the cost function: uniform, sigmoid, or discounted
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1)

    self.last_return = -np.inf

    self.data = {}
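# --- Illustrative sketch (added; helper name is hypothetical) ---
# The three cost-weighting schemes built by the constructors above, as a
# standalone function: uniform, sigmoid ramp, or geometric discounting.
import numpy as np

def make_cost_weighting(nb_steps, activation=None):
    if activation is None:
        # uniform weighting over the horizon
        return np.ones((nb_steps + 1, ))
    elif "mult" in activation and "shift" in activation:
        # sigmoid ramp with slope 'mult' and inflection point 'shift'
        t = np.linspace(0, nb_steps, nb_steps + 1)
        return 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        # geometric discounting: weight gamma^t at time t
        weighting = np.ones((nb_steps + 1, ))
        weighting[1:] = np.cumprod(activation["discount"] * np.ones((nb_steps, )))
        return weighting
    else:
        raise NotImplementedError

# usage: make_cost_weighting(100, {'discount': 0.99}) weights the cost by 0.99^t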
def cubature_forward_pass(self, lgc, param):
    xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    udist = Gaussian(self.dm_act, self.nb_steps)
    xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    # delegates to the module-level (vectorized) cubature_forward_pass,
    # not to this method
    xdist.mu, xdist.sigma,\
    udist.mu, udist.sigma,\
    xudist.mu, xudist.sigma = cubature_forward_pass(self.xdist.mu[..., 0], self.xdist.sigma[..., 0],
                                                    param.mu, param.sigma, self.noise,
                                                    lgc.K, lgc.kff, lgc.sigma,
                                                    self.dm_state, self.dm_act, self.nb_steps)

    # Reference (pure NumPy) implementation, kept for documentation:
    # dm_augmented = self.dm_state + self.dm_act + 1 + self.dm_state
    # chol_sigma_augmented = 1e-8 * np.eye(dm_augmented)
    #
    # input_cubature_points = np.zeros((dm_augmented, 2 * dm_augmented))
    # output_cubature_points = np.zeros((self.dm_state, 2 * dm_augmented))
    #
    # xdist.mu[..., 0] = self.xdist.mu[..., 0]
    # xdist.sigma[..., 0] = self.xdist.sigma[..., 0]
    # for t in range(self.nb_steps):
    #     At = param.mu[0: self.dm_state * self.dm_state, t]
    #     Bt = param.mu[self.dm_state * self.dm_state: self.dm_state * self.dm_state + self.dm_state * self.dm_act, t]
    #     ct = param.mu[self.dm_state * self.dm_state + self.dm_state * self.dm_act:
    #                   self.dm_state * self.dm_state + self.dm_state * self.dm_act + self.dm_state, t]
    #
    #     At = np.reshape(At, (self.dm_state, self.dm_state), order='F')
    #     Bt = np.reshape(Bt, (self.dm_state, self.dm_act), order='F')
    #     ct = np.reshape(ct, (self.dm_state, 1), order='F')
    #
    #     udist.mu[..., t] = lgc.K[..., t] @ xdist.mu[..., t] + lgc.kff[..., t]
    #     udist.sigma[..., t] = lgc.sigma[..., t] + lgc.K[..., t] @ xdist.sigma[..., t] @ lgc.K[..., t].T
    #     udist.sigma[..., t] = 0.5 * (udist.sigma[..., t] + udist.sigma[..., t].T)
    #     udist.sigma[..., t] += 1e-8 * np.eye(self.dm_act)
    #
    #     xudist.mu[..., t] = np.hstack((xdist.mu[..., t], udist.mu[..., t]))
    #     xudist.sigma[..., t] = np.block([[xdist.sigma[..., t], xdist.sigma[..., t] @ lgc.K[..., t].T],
    #                                      [lgc.K[..., t] @ xdist.sigma[..., t], udist.sigma[..., t]]])
    #     xudist.sigma[..., t] += 1e-8 * np.eye(self.dm_state + self.dm_act)
    #
    #     mu_augmented = np.hstack((xudist.mu[..., t], 1., np.zeros((self.dm_state, ))))
    #     chol_sigma_augmented[0: self.dm_state + self.dm_act,
    #                          0: self.dm_state + self.dm_act] = np.linalg.cholesky(xudist.sigma[..., t])
    #     chol_sigma_augmented[-self.dm_state:, -self.dm_state:] = np.eye(self.dm_state)
    #
    #     input_cubature_points = np.hstack((chol_sigma_augmented, - chol_sigma_augmented))
    #     input_cubature_points *= np.sqrt(dm_augmented)
    #     input_cubature_points += mu_augmented[:, None]
    #
    #     for k in range(2 * dm_augmented):
    #         vec_xu = input_cubature_points[:, k]
    #         mat_xu = np.kron(vec_xu[:self.dm_state + self.dm_act + 1], np.eye(self.dm_state))
    #         covar = self.noise[..., t] + mat_xu @ param.sigma[..., t] @ mat_xu.T
    #         covar = 0.5 * (covar + covar.T)
    #
    #         chol_covar = np.linalg.cholesky(covar)
    #         output_cubature_points[:, k] = np.hstack((At, Bt, ct, chol_covar)) @ vec_xu
    #
    #     xdist.mu[..., t + 1] = np.mean(output_cubature_points, axis=-1)
    #
    #     xdist.sigma[..., t + 1] = np.zeros((self.dm_state, self.dm_state))
    #     for k in range(2 * dm_augmented):
    #         diff = output_cubature_points[:, k] - xdist.mu[..., t + 1]
    #         xdist.sigma[..., t + 1] += diff[:, None] @ diff[:, None].T
    #
    #     xdist.sigma[..., t + 1] /= (2 * dm_augmented)
    #     xdist.sigma[..., t + 1] = 0.5 * (xdist.sigma[..., t + 1] + xdist.sigma[..., t + 1].T)
    #
    #     if t == self.nb_steps - 1:
    #         xudist.mu[..., t + 1] = np.hstack((xdist.mu[..., t + 1], np.zeros((self.dm_act, ))))
    #         xudist.sigma[:self.dm_state, :self.dm_state, t + 1] = xdist.sigma[..., t + 1]

    return xdist, udist, xudist
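# --- Illustrative sketch (added; helper name is hypothetical) ---
# The commented reference above uses the spherical cubature rule: for an
# n-dimensional Gaussian it evaluates 2n equally weighted points placed at
# mu +/- sqrt(n) times the columns of a Cholesky factor of sigma. The moments
# of a nonlinear pushforward are then approximated by the empirical mean and
# covariance of the transformed points.
import numpy as np

def spherical_cubature_points(mu, sigma):
    n = mu.shape[0]
    chol = np.linalg.cholesky(sigma)
    # 2n points: mu + sqrt(n) * L[:, i] and mu - sqrt(n) * L[:, i]
    return np.sqrt(n) * np.hstack((chol, -chol)) + mu[:, None]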
def __init__(self, env, nb_steps, init_state,
             init_action_sigma=1.,
             policy_kl_bound=0.1,
             param_nominal_kl_bound=100.,
             param_regularizer_kl_bound=1.,
             policy_kl_stepwise=False,
             activation=None,
             slew_rate=False,
             action_penalty=None,
             nominal_variance=1e-8):

    # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_noise = self.env.unwrapped.sigma
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    self.policy_kl_stepwise = policy_kl_stepwise
    if self.policy_kl_stepwise:
        self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
        self.alpha = 1e8 * np.ones((self.nb_steps, ))
    else:
        self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
        self.alpha = 1e8 * np.ones((1, ))

    self.param_nominal_kl_bound = param_nominal_kl_bound * np.ones((1, ))
    self.beta = 1e16 * np.ones((1, ))

    self.param_regularizer_kl_bound = param_regularizer_kl_bound * np.ones((1, ))
    self.eta = 1e16 * np.ones((1, ))

    # create state distribution and initialize first time step
    self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

    self.udist = Gaussian(self.dm_act, self.nb_steps)
    self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    # we assume the process noise of the dynamics is known
    self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
    for t in range(self.nb_steps):
        self.noise[..., t] = self.env_noise

    self.param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)
    self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # nominal LQG dynamics: linearize the true dynamics at the origin
    from autograd import jacobian
    input = tuple([np.zeros((self.dm_state, )), np.zeros((self.dm_act, ))])
    A = jacobian(self.env.unwrapped.dynamics, 0)(*input)
    B = jacobian(self.env.unwrapped.dynamics, 1)(*input)
    c = self.env.unwrapped.dynamics(*input)

    tmp = np.hstack((A, B, c[:, None]))
    for t in range(self.nb_steps):
        self.nominal.mu[..., t] = np.reshape(tmp, self.dm_param, order='F')
        self.nominal.sigma[..., t] = nominal_variance * np.eye(self.dm_param)

    self.ctl = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps, init_action_sigma)

    # activation weighting of the cost function: uniform, sigmoid, or discounted
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1)

    self.data = {}
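# --- Illustrative sketch (added; toy_dynamics is a made-up example) ---
# The constructor above builds the nominal model by differentiating the true
# dynamics with autograd at the origin. The standalone version below also
# subtracts A @ x_ref + B @ u_ref from the drift so that
# f(x, u) ~= A @ x + B @ u + c holds around any reference point; at the
# origin this reduces to c = f(0, 0), as in the constructor.
import autograd.numpy as np
from autograd import jacobian

def linearize(dynamics, x_ref, u_ref):
    A = jacobian(dynamics, 0)(x_ref, u_ref)
    B = jacobian(dynamics, 1)(x_ref, u_ref)
    c = dynamics(x_ref, u_ref) - A @ x_ref - B @ u_ref
    return A, B, c

def toy_dynamics(x, u):
    # double-integrator-like discrete dynamics, for demonstration only
    Ad = np.array([[1., 0.01], [0., 1.]])
    Bd = np.array([[0.], [0.01]])
    return Ad @ x + Bd @ u

# usage: A, B, c = linearize(toy_dynamics, np.zeros((2, )), np.zeros((1, )))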