def step_jha(a, ws, rs):
    """Step-based robust empirical estimate of expected cost.

    Per trajectory, rewards are weighted by the running (cumulative)
    product of the step weights, summed over time, passed through the
    truncated-log transform scaled by `a`, and averaged.

    :param a: scalar temperature/scale of the truncated-log transform
    :param ws: per-step weights, shape (M, T) — TODO confirm with caller
    :param rs: per-step rewards/costs, shape (M, T)
    :return: scalar robust estimate of the expected cost
    """
    nb_traj = rs.shape[0]
    running_ws = np.cumprod(ws, axis=1)
    traj_returns = (rs * running_ws).sum(axis=1)
    return logtrunc(a * traj_returns).sum() / (nb_traj * a)
def __init__(self, env, nb_steps, init_state, activation=None):
    """Planner around an analytically linearized model with quadratic cost.

    :param env: gym-like env exposing `unwrapped.dynamics`, `unwrapped.sigma`
                and `unwrapped.cost`
    :param nb_steps: planning horizon (number of control steps)
    :param init_state: initial state; only `init_state[0]` seeds the
                       reference — presumably (mean, cov); confirm with caller
    :param activation: time weighting of the cost — None for uniform,
                       {'mult', 'shift'} for a sigmoid ramp, or
                       {'discount'} for geometric discounting
    """
    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_noise = self.env.unwrapped.sigma
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    self.nb_steps = nb_steps

    # reference trajectory, first column seeded with the initial state
    self.xref = np.zeros((self.dm_state, self.nb_steps + 1))
    self.xref[..., 0] = self.env_init[0]
    self.uref = np.zeros((self.dm_act, self.nb_steps))

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.dyn = AnalyticalLinearDynamics(self.env_dyn, self.dm_state,
                                        self.dm_act, self.nb_steps)
    self.ctl = LinearControl(self.dm_state, self.dm_act, self.nb_steps)

    # activation of cost function in shape of sigmoid
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    # FIX: was `"mult" and "shift" in activation`, which (by operator
    # precedence) only tested `"shift" in activation`
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(- activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1,))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                        self.dm_act, self.nb_steps + 1)
def cumprod(x, axis=0):
    """Cumulative product of `x` along `axis` (flattened mode unsupported)."""
    if axis is not None:
        return _np.cumprod(x, axis=axis)
    raise NotImplementedError('cumprod is not defined where axis is None')
def __init__(self, env, nb_steps, init_state, init_action=None, alphas=np.power(10., np.linspace(0, -3, 11)), lmbda=1., dlmbda=1., min_lmbda=1e-6, max_lmbda=1e6, mult_lmbda=1.6, tolfun=1e-6, tolgrad=1e-4, min_imp=0., reg=1, discounting=1.): self.env = env # expose necessary functions self.env_dyn = self.env.unwrapped.dynamics self.env_cost = self.env.unwrapped.cost self.env_init = init_state self.ulim = self.env.action_space.high self.dm_state = self.env.observation_space.shape[0] self.dm_act = self.env.action_space.shape[0] self.nb_steps = nb_steps # backtracking self.alphas = alphas self.alpha = None self.lmbda = lmbda self.dlmbda = dlmbda self.min_lmbda = min_lmbda self.max_lmbda = max_lmbda self.mult_lmbda = mult_lmbda # regularization type self.reg = reg # minimum relative improvement self.min_imp = min_imp # stopping criterion self.tolfun = tolfun self.tolgrad = tolgrad # reference trajectory self.xref = np.zeros((self.dm_state, self.nb_steps + 1)) self.xref[..., 0] = self.env_init self.uref = np.zeros((self.dm_act, self.nb_steps)) self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1) self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps) self.dyn = AnalyticalLinearDynamics(self.env_dyn, self.dm_state, self.dm_act, self.nb_steps) self.ctl = LinearControl(self.dm_state, self.dm_act, self.nb_steps) if init_action is None: self.ctl.kff = 1e-8 * np.random.randn(self.dm_act, self.nb_steps) else: assert init_action.shape[1] == self.nb_steps self.ctl.kff = init_action # Discounting self.weighting = np.ones((self.nb_steps + 1,)) _gamma = discounting * np.ones((self.nb_steps, )) self.weighting[1:] = np.cumprod(_gamma) self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1) self.last_return = - np.inf
def __init__(self, env, nb_steps, init_state,
             activation=None, slew_rate=False, action_penalty=None,
             alphas=np.power(10., np.linspace(0, -3, 11)),
             lmbda=1., dlmbda=1., min_lmbda=1e-6, max_lmbda=1e6,
             mult_lmbda=1.6, tolfun=1e-6, tolgrad=1e-4,
             min_imp=0., reg=1):
    """iLQR-style optimizer with sigmoid/discount cost activation.

    :param env: gym-like env exposing `unwrapped.dynamics` and `unwrapped.cost`
    :param nb_steps: planning horizon (number of control steps)
    :param init_state: initial state vector seeding the reference trajectory
    :param activation: time weighting of the cost — None for uniform,
                       {'mult', 'shift'} for a sigmoid ramp, or
                       {'discount'} for geometric discounting
    :param slew_rate: enable env's slew-rate penalty
    :param action_penalty: scalar action penalty written into env, or None
                           to leave the env's penalty untouched
    :param alphas: backtracking step sizes
    :param lmbda/dlmbda/min_lmbda/max_lmbda/mult_lmbda: regularizer schedule
    :param tolfun/tolgrad: stopping tolerances
    :param min_imp: minimum relative improvement to accept a step
    :param reg: regularization type
    """
    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    # FIX: default was `action_penalty=False`, which is `not None` and so
    # silently zeroed env.unwrapped.uw on every default construction;
    # sibling constructors all default to None
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    # backtracking
    self.alphas = alphas
    self.alpha = None

    self.lmbda = lmbda
    self.dlmbda = dlmbda
    self.min_lmbda = min_lmbda
    self.max_lmbda = max_lmbda
    self.mult_lmbda = mult_lmbda

    # regularization type
    self.reg = reg

    # minimum relative improvement
    self.min_imp = min_imp

    # stopping criterion
    self.tolfun = tolfun
    self.tolgrad = tolgrad

    # reference trajectory, first column seeded with the initial state
    self.xref = np.zeros((self.dm_state, self.nb_steps + 1))
    self.xref[..., 0] = self.env_init
    self.uref = np.zeros((self.dm_act, self.nb_steps))

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    self.dyn = AnalyticalLinearDynamics(self.env_dyn, self.dm_state,
                                        self.dm_act, self.nb_steps)
    self.ctl = LinearControl(self.dm_state, self.dm_act, self.nb_steps)
    # tiny random feed-forward to break symmetry
    self.ctl.kff = 1e-4 * np.random.randn(self.dm_act, self.nb_steps)

    # activation of cost function in shape of sigmoid
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    # FIX: was `"mult" and "shift" in activation`, which (by operator
    # precedence) only tested `"shift" in activation`
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                        self.dm_act, self.nb_steps + 1)

    self.last_return = -np.inf
def __init__(self, env, nb_steps, init_state, init_action_sigma=1.,
             policy_kl_bound=0.1, param_kl_bound=100,
             kl_stepwise=False, activation=None,
             slew_rate=False, action_penalty=None):
    """KL-constrained Gaussian policy optimizer over a fixed nominal LQG model.

    :param env: gym-like env exposing `unwrapped.dynamics`, `unwrapped.sigma`
                and `unwrapped.cost`
    :param nb_steps: planning horizon (number of control steps)
    :param init_state: (mean, covariance) of the initial state distribution
    :param init_action_sigma: initial action noise of the Gaussian controller
    :param policy_kl_bound: KL trust region on the policy (per step if
                            `kl_stepwise`, otherwise one global bound)
    :param param_kl_bound: KL bound on the dynamics parameters
    :param kl_stepwise: use one policy KL bound/multiplier per time step
    :param activation: time weighting of the cost — None for uniform,
                       {'mult', 'shift'} for a sigmoid ramp, or
                       {'discount'} for geometric discounting
    :param slew_rate: enable env's slew-rate penalty
    :param action_penalty: scalar action penalty written into env, or None
    """
    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_noise = self.env.unwrapped.sigma
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    # number of parameters of a (A, B, c) linear model, column-stacked
    self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)
    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    self.kl_stepwise = kl_stepwise
    if self.kl_stepwise:
        self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
        self.alpha = 1e8 * np.ones((self.nb_steps, ))
    else:
        self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
        self.alpha = 1e8 * np.ones((1, ))

    self.param_kl_bound = param_kl_bound
    self.beta = np.array([1e16])

    # create state distribution and initialize first time step
    self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

    self.udist = Gaussian(self.dm_act, self.nb_steps)
    self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # hard-coded nominal LQG dynamics (A, B, c), identical at every step
    _A = np.array([[9.99999500e-01, 9.99000500e-03],
                   [-9.99000500e-05, 9.98001499e-01]])
    _B = np.array([[4.99666792e-05],
                   [9.99000500e-03]])
    _c = np.array([0., 0.])
    _params = np.hstack((_A, _B, _c[:, None]))
    for t in range(self.nb_steps):
        # column-major flattening to match dm_param layout
        self.nominal.mu[..., t] = np.reshape(_params, self.dm_param, order='F')
        self.nominal.sigma[..., t] = 1e-8 * np.eye(self.dm_param)

    self.param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # We assume process noise over dynamics is known
    self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
    for t in range(self.nb_steps):
        self.noise[..., t] = self.env_noise

    self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                     self.nb_steps, init_action_sigma)

    # activation of cost function in shape of sigmoid
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    # FIX: was `"mult" and "shift" in activation`, which (by operator
    # precedence) only tested `"shift" in activation`
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                        self.dm_act, self.nb_steps + 1)

    self.last_return = -np.inf

    self.data = {}
def __init__(self, env, nb_steps, init_state, init_action_sigma=1.,
             policy_kl_bound=0.1, param_kl_bound=100,
             kl_stepwise=False, activation=None,
             slew_rate=False, action_penalty=None, prior=None):
    """KL-constrained Gaussian policy optimizer with a learned
    probabilistic-linear nominal model.

    :param env: gym-like env exposing `unwrapped.dynamics` and `unwrapped.cost`
    :param nb_steps: planning horizon (number of control steps)
    :param init_state: (mean, covariance) of the initial state distribution
    :param init_action_sigma: initial action noise of the Gaussian controller
    :param policy_kl_bound: KL trust region on the policy (per step if
                            `kl_stepwise`, otherwise one global bound)
    :param param_kl_bound: KL bound on the dynamics parameters
    :param kl_stepwise: use one policy KL bound/multiplier per time step
    :param activation: time weighting of the cost — None for uniform,
                       {'mult', 'shift'} for a sigmoid ramp, or
                       {'discount'} for geometric discounting
    :param slew_rate: enable env's slew-rate penalty
    :param action_penalty: scalar action penalty written into env, or None
    :param prior: prior for the learned probabilistic linear dynamics
    """
    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    # number of parameters of a (A, B, c) linear model, column-stacked
    self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)
    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    self.kl_stepwise = kl_stepwise
    if self.kl_stepwise:
        self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
        self.alpha = 1e8 * np.ones((self.nb_steps, ))
    else:
        self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
        self.alpha = 1e8 * np.ones((1, ))

    self.param_kl_bound = param_kl_bound
    self.beta = np.array([1e8])

    # create state distribution and initialize first time step
    self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

    self.udist = Gaussian(self.dm_act, self.nb_steps)
    self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    # nominal model is learned from data here, not hard-coded
    self.nominal = LearnedProbabilisticLinearDynamics(self.dm_state, self.dm_act,
                                                      self.nb_steps, prior)
    self.param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # NOTE(review): noise is left at zeros here (no env sigma filled in),
    # unlike the analytic-nominal variant — confirm this is intentional
    self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))

    self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                     self.nb_steps, init_action_sigma)

    # activation of cost function in shape of sigmoid
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    # FIX: was `"mult" and "shift" in activation`, which (by operator
    # precedence) only tested `"shift" in activation`
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                        self.dm_act, self.nb_steps + 1)

    self.last_return = -np.inf

    self.data = {}
def __init__(self, env, nb_steps, init_state, init_action_sigma=1.,
             policy_kl_bound=0.1,
             param_nominal_kl_bound=100.,
             param_regularizer_kl_bound=1.,
             policy_kl_stepwise=False, activation=None,
             slew_rate=False, action_penalty=None,
             nominal_variance=1e-8):
    """KL-constrained optimizer with a nominal model linearized via autograd
    and separate nominal/regularizer parameter KL bounds.

    :param env: gym-like env exposing `unwrapped.dynamics`, `unwrapped.sigma`
                and `unwrapped.cost`
    :param nb_steps: planning horizon (number of control steps)
    :param init_state: (mean, covariance) of the initial state distribution
    :param init_action_sigma: initial action noise of the Gaussian controller
    :param policy_kl_bound: KL trust region on the policy (per step if
                            `policy_kl_stepwise`, otherwise one global bound)
    :param param_nominal_kl_bound: KL bound of parameters to the nominal model
    :param param_regularizer_kl_bound: KL bound of parameters to the regularizer
    :param policy_kl_stepwise: use one policy KL bound/multiplier per time step
    :param activation: time weighting of the cost — None for uniform,
                       {'mult', 'shift'} for a sigmoid ramp, or
                       {'discount'} for geometric discounting
    :param slew_rate: enable env's slew-rate penalty
    :param action_penalty: scalar action penalty written into env, or None
    :param nominal_variance: isotropic variance of the nominal parameter prior
    """
    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_noise = self.env.unwrapped.sigma
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    # number of parameters of a (A, B, c) linear model, column-stacked
    self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)
    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    self.policy_kl_stepwise = policy_kl_stepwise
    if self.policy_kl_stepwise:
        self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
        self.alpha = 1e8 * np.ones((self.nb_steps, ))
    else:
        self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
        self.alpha = 1e8 * np.ones((1, ))

    self.param_nominal_kl_bound = param_nominal_kl_bound * np.ones((1, ))
    self.beta = 1e16 * np.ones((1, ))

    self.param_regularizer_kl_bound = param_regularizer_kl_bound * np.ones((1, ))
    self.eta = 1e16 * np.ones((1, ))

    # create state distribution and initialize first time step
    self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

    self.udist = Gaussian(self.dm_act, self.nb_steps)
    self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    # We assume process noise over dynamics is known
    self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
    for t in range(self.nb_steps):
        self.noise[..., t] = self.env_noise

    self.param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)
    self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)

    # nominal LQG dynamics: linearize env dynamics around the origin
    from autograd import jacobian
    # FIX: renamed local `input` — it shadowed the builtin
    args = tuple([np.zeros((self.dm_state, )), np.zeros((self.dm_act, ))])
    A = jacobian(self.env.unwrapped.dynamics, 0)(*args)
    B = jacobian(self.env.unwrapped.dynamics, 1)(*args)
    c = self.env.unwrapped.dynamics(*args)

    tmp = np.hstack((A, B, c[:, None]))
    for t in range(self.nb_steps):
        # column-major flattening to match dm_param layout
        self.nominal.mu[..., t] = np.reshape(tmp, self.dm_param, order='F')
        self.nominal.sigma[..., t] = nominal_variance * np.eye(self.dm_param)

    self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                     self.nb_steps, init_action_sigma)

    # activation of cost function in shape of sigmoid
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    # FIX: was `"mult" and "shift" in activation`, which (by operator
    # precedence) only tested `"shift" in activation`
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                        self.dm_act, self.nb_steps + 1)

    self.data = {}
def __init__(self, env, nb_steps, init_state, init_action_sigma=1.,
             kl_bound=0.1, kl_adaptive=False, kl_stepwise=False,
             activation=None, slew_rate=False, action_penalty=None):
    """KL-constrained Gaussian policy optimizer over analytically linearized
    Gaussian dynamics, with an optional adaptive KL multiplier.

    :param env: gym-like env exposing `unwrapped.dynamics`, `unwrapped.noise`
                and `unwrapped.cost`
    :param nb_steps: planning horizon (number of control steps)
    :param init_state: (mean, covariance) of the initial state distribution
    :param init_action_sigma: initial action noise of the Gaussian controller
    :param kl_bound: KL trust region (per step if `kl_stepwise`,
                     otherwise one global bound)
    :param kl_adaptive: adapt the KL bound via a bounded multiplier
    :param kl_stepwise: use one KL bound/multiplier per time step
    :param activation: time weighting of the cost — None for uniform,
                       {'mult', 'shift'} for a sigmoid ramp, or
                       {'discount'} for geometric discounting
    :param slew_rate: enable env's slew-rate penalty
    :param action_penalty: scalar action penalty written into env, or None
    """
    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_noise = self.env.unwrapped.noise
    self.env_cost = self.env.unwrapped.cost
    self.env_init = init_state

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    self.nb_steps = nb_steps

    # use slew rate penalty or not
    self.env.unwrapped.slew_rate = slew_rate
    if action_penalty is not None:
        self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

    self.kl_stepwise = kl_stepwise
    if self.kl_stepwise:
        self.kl_base = kl_bound * np.ones((self.nb_steps, ))
        self.kl_bound = kl_bound * np.ones((self.nb_steps, ))
        self.alpha = 1e8 * np.ones((self.nb_steps, ))
    else:
        self.kl_base = kl_bound * np.ones((1, ))
        self.kl_bound = kl_bound * np.ones((1, ))
        self.alpha = 1e8 * np.ones((1, ))

    # kl mult.
    self.kl_adaptive = kl_adaptive
    self.kl_mult = 1.
    self.kl_mult_min = 0.1
    self.kl_mult_max = 5.0

    # create state distribution and initialize first time step
    self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
    self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

    self.udist = Gaussian(self.dm_act, self.nb_steps)
    self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

    self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

    self.dyn = AnalyticalLinearGaussianDynamics(self.env_dyn, self.env_noise,
                                                self.dm_state, self.dm_act,
                                                self.nb_steps)
    self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                     self.nb_steps, init_action_sigma)
    # tiny random feed-forward to break symmetry
    self.ctl.kff = 1e-4 * np.random.randn(self.dm_act, self.nb_steps)

    # activation of cost function in shape of sigmoid
    if activation is None:
        self.weighting = np.ones((self.nb_steps + 1, ))
    # FIX: was `"mult" and "shift" in activation`, which (by operator
    # precedence) only tested `"shift" in activation`
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        self.weighting = 1. / (1. + np.exp(-activation['mult'] * (t - activation['shift'])))
    elif "discount" in activation:
        self.weighting = np.ones((self.nb_steps + 1, ))
        gamma = activation["discount"] * np.ones((self.nb_steps, ))
        self.weighting[1:] = np.cumprod(gamma)
    else:
        raise NotImplementedError

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                        self.dm_act, self.nb_steps + 1)

    self.last_return = -np.inf