def __init__(self, env, nb_steps, kl_bound, init_ctl_sigma, activation=range(-1, 0)): self.env = env # expose necessary functions self.env_dyn = self.env.unwrapped.dynamics self.env_noise = self.env.unwrapped.noise self.env_cost = self.env.unwrapped.cost self.env_init = self.env.unwrapped.init self.ulim = self.env.action_space.high self.nb_xdim = self.env.observation_space.shape[0] self.nb_udim = self.env.action_space.shape[0] self.nb_steps = nb_steps # total kl over traj. self.kl_base = kl_bound self.kl_bound = kl_bound # kl mult. self.kl_mult = 1. self.kl_mult_min = 0.1 self.kl_mult_max = 5.0 self.alpha = np.array([-100.]) # create state distribution and initialize first time step self.xdist = Gaussian(self.nb_xdim, self.nb_steps + 1) self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init() self.udist = Gaussian(self.nb_udim, self.nb_steps) self.xudist = Gaussian(self.nb_xdim + self.nb_udim, self.nb_steps + 1) self.vfunc = QuadraticStateValue(self.nb_xdim, self.nb_steps + 1) self.qfunc = QuadraticStateActionValue(self.nb_xdim, self.nb_udim, self.nb_steps) self.dyn = LearnedLinearGaussianDynamics(self.nb_xdim, self.nb_udim, self.nb_steps) self.ctl = LinearGaussianControl(self.nb_xdim, self.nb_udim, self.nb_steps, init_ctl_sigma) # activation of cost function self.activation = np.zeros((self.nb_steps + 1, ), dtype=np.int64) self.activation[-1] = 1. # last step always in self.activation[activation] = 1. self.cost = AnalyticalQuadraticCost(self.env_cost, self.nb_xdim, self.nb_udim, self.nb_steps + 1) self.last_return = -np.inf self.data = {}
def forward_pass(self, lgc): xdist = Gaussian(self.dm_state, self.nb_steps + 1) udist = Gaussian(self.dm_act, self.nb_steps) xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1) xdist.mu, xdist.sigma,\ udist.mu, udist.sigma,\ xudist.mu, xudist.sigma = forward_pass(self.xdist.mu[..., 0], self.xdist.sigma[..., 0], self.dyn.A, self.dyn.B, self.dyn.c, self.dyn.sigma, lgc.K, lgc.kff, lgc.sigma, self.dm_state, self.dm_act, self.nb_steps) return xdist, udist, xudist
def __init__(self, env, nb_steps, init_state, init_action_sigma=1., kl_bound=0.1, kl_adaptive=False): self.env = env # expose necessary functions self.env_dyn = self.env.unwrapped.dynamics self.env_noise = self.env.unwrapped.noise self.env_cost = self.env.unwrapped.cost self.env_init = init_state self.ulim = self.env.action_space.high self.dm_state = self.env.observation_space.shape[0] self.dm_act = self.env.action_space.shape[0] self.nb_steps = nb_steps # total kl over traj. self.kl_base = kl_bound self.kl_bound = kl_bound # kl mult. self.kl_adaptive = kl_adaptive self.kl_mult = 1. self.kl_mult_min = 0.1 self.kl_mult_max = 5.0 self.alpha = np.array([-1e4]) # create state distribution and initialize first time step self.xdist = Gaussian(self.dm_state, self.nb_steps + 1) self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init self.udist = Gaussian(self.dm_act, self.nb_steps) self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1) self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1) self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps) self.dyn = AnalyticalLinearGaussianDynamics(self.env_dyn, self.env_noise, self.dm_state, self.dm_act, self.nb_steps) self.ctl = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps, init_action_sigma) self.ctl.kff = 1e-2 * np.random.randn(self.dm_act, self.nb_steps) self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1) self.last_return = - np.inf
def forward_pass(self, lgc): """ Forward pass on linearized system :param lgc: :return: """ xdist = Gaussian(self.nb_xdim, self.nb_steps + 1) udist = Gaussian(self.nb_udim, self.nb_steps) xudist = Gaussian(self.nb_xdim + self.nb_udim, self.nb_steps + 1) xdist.mu, xdist.sigma,\ udist.mu, udist.sigma,\ xudist.mu, xudist.sigma = forward_pass(self.xdist.mu[..., 0], self.xdist.sigma[..., 0], self.dyn.A, self.dyn.B, self.dyn.c, self.dyn.sigma, lgc.K, lgc.kff, lgc.sigma, self.nb_xdim, self.nb_udim, self.nb_steps) return xdist, udist, xudist
def extended_kalman(self, lgc): """ Forward pass on actual dynamics :param lgc: :return: """ xdist = Gaussian(self.nb_xdim, self.nb_steps + 1) udist = Gaussian(self.nb_udim, self.nb_steps) cost = np.zeros((self.nb_steps + 1, )) xdist.mu[..., 0], xdist.sigma[..., 0] = self.dyn.evali() for t in range(self.nb_steps): udist.mu[..., t], udist.sigma[..., t] = lgc.forward(xdist, t) cost[..., t] = self.cost.evalf(xdist.mu[..., t], udist.mu[..., t], self.activation[t]) xdist.mu[..., t + 1], xdist.sigma[..., t + 1] = self.dyn.forward( xdist, udist, lgc, t) cost[..., -1] = self.cost.evalf(xdist.mu[..., -1], np.zeros((self.nb_udim, )), self.activation[-1]) return xdist, udist, cost
def __init__(self, env, nb_steps, kl_bound, init_ctl_sigma, activation=None): self.env = env # expose necessary functions self.env_dyn = self.env.unwrapped.dynamics self.env_noise = self.env.unwrapped.noise self.env_cost = self.env.unwrapped.cost self.env_init = self.env.unwrapped.init self.ulim = self.env.action_space.high self.dm_state = self.env.observation_space.shape[0] self.dm_act = self.env.action_space.shape[0] self.nb_steps = nb_steps # total kl over traj. self.kl_base = kl_bound self.kl_bound = kl_bound # kl mult. self.kl_mult = 1. self.kl_mult_min = 0.1 self.kl_mult_max = 5.0 self.alpha = np.array([-1e4]) # create state distribution and initialize first time step self.xdist = Gaussian(self.dm_state, self.nb_steps + 1) self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init() self.udist = Gaussian(self.dm_act, self.nb_steps) self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1) self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1) self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps) self.dyn = LearnedLinearGaussianDynamics(self.dm_state, self.dm_act, self.nb_steps) self.ctl = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps, init_ctl_sigma) # activation of cost function in shape of sigmoid if activation is None: self.weighting = np.ones((self.nb_steps + 1, )) else: _t = np.linspace(0, self.nb_steps, self.nb_steps + 1) self.weighting = 1. / (1. + np.exp(-activation['mult'] * (_t - activation['shift']))) self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1) self.last_return = -np.inf self.data = {}
def __init__(self, model, reward, horizon, kl_bound, u_lim, init_ctl_sigma, init_noise, activation='last'): self.env = model self.env_dyn = self.env.dynamics self.env_noise = lambda x, u: self.env.sigV self.env_cost = reward self.env_init = self.env.init self.ulim = u_lim self.nb_xdim = self.env.dim_x self.nb_udim = self.env.dim_u self.nb_steps = horizon # total kl over traj. # total kl over traj. self.kl_base = kl_bound self.kl_bound = kl_bound # kl mult. self.kl_mult = 1. self.kl_mult_min = 0.1 self.kl_mult_max = 5.0 self.alpha = np.array([-100.]) # create state distribution and initialize first time step self.xdist = Gaussian(self.nb_xdim, self.nb_steps + 1) self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init() self.udist = Gaussian(self.nb_udim, self.nb_steps) self.xudist = Gaussian(self.nb_xdim + self.nb_udim, self.nb_steps + 1) self.vfunc = QuadraticStateValue(self.nb_xdim, self.nb_steps + 1) self.qfunc = QuadraticStateActionValue(self.nb_xdim, self.nb_udim, self.nb_steps) self.dyn = AnalyticalLinearGaussianDynamics(self.env_init, self.env_dyn, self.env_noise, self.nb_xdim, self.nb_udim, self.nb_steps) self.ctl = LinearGaussianControl(self.nb_xdim, self.nb_udim, self.nb_steps, init_ctl_sigma) self.ctl.kff = init_noise * np.random.randn(self.nb_udim, self.nb_steps) if activation == 'all': self.activation = np.ones((self.nb_steps + 1, ), dtype=np.int64) else: self.activation = np.zeros((self.nb_steps + 1, ), dtype=np.int64) self.activation[-1] = 1 self.cost = AnalyticalQuadraticCost(self.env_cost, self.nb_xdim, self.nb_udim, self.nb_steps + 1)
def __init__(self, env, nb_steps, init_state, init_action_sigma=1., kl_bound=0.1, kl_adaptive=False, kl_stepwise=False, activation=None, slew_rate=False, action_penalty=None): self.env = env # expose necessary functions self.env_dyn = self.env.unwrapped.dynamics self.env_noise = self.env.unwrapped.noise self.env_cost = self.env.unwrapped.cost self.env_init = init_state self.ulim = self.env.action_space.high self.dm_state = self.env.observation_space.shape[0] self.dm_act = self.env.action_space.shape[0] self.nb_steps = nb_steps # use slew rate penalty or not self.env.unwrapped.slew_rate = slew_rate if action_penalty is not None: self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, )) self.kl_stepwise = kl_stepwise if self.kl_stepwise: self.kl_base = kl_bound * np.ones((self.nb_steps, )) self.kl_bound = kl_bound * np.ones((self.nb_steps, )) self.alpha = 1e8 * np.ones((self.nb_steps, )) else: self.kl_base = kl_bound * np.ones((1, )) self.kl_bound = kl_bound * np.ones((1, )) self.alpha = 1e8 * np.ones((1, )) # kl mult. self.kl_adaptive = kl_adaptive self.kl_mult = 1. self.kl_mult_min = 0.1 self.kl_mult_max = 5.0 # create state distribution and initialize first time step self.xdist = Gaussian(self.dm_state, self.nb_steps + 1) self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init self.udist = Gaussian(self.dm_act, self.nb_steps) self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1) self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1) self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps) self.dyn = AnalyticalLinearGaussianDynamics(self.env_dyn, self.env_noise, self.dm_state, self.dm_act, self.nb_steps) self.ctl = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps, init_action_sigma) self.ctl.kff = 1e-4 * np.random.randn(self.dm_act, self.nb_steps) # activation of cost function in shape of sigmoid if activation is None: self.weighting = np.ones((self.nb_steps + 1, )) elif "mult" and "shift" in activation: t = np.linspace(0, self.nb_steps, self.nb_steps + 1) self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift']))) elif "discount" in activation: self.weighting = np.ones((self.nb_steps + 1, )) gamma = activation["discount"] * np.ones((self.nb_steps, )) self.weighting[1:] = np.cumprod(gamma) else: raise NotImplementedError self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1) self.last_return = -np.inf