def __init__(self, env, gamma_optimistic, gamma_cautious, lambda_cautious,
             x_seed, y_seed, gp_params=None, keep_seed_in_data=True):
    """
    Build the safety model and the policies used by the safety learner.

    :param env: the environment
    :param gamma_optimistic: the gamma parameter for Q_optimistic; it is
        handed to the MaternSafety model
    :param gamma_cautious: the gamma parameter for Q_cautious
    :param lambda_cautious: the lambda parameter for Q_cautious
    :param x_seed: the seed input of the GP
    :param y_seed: the seed output of the GP
    :param gp_params: the parameters of the GP. See
        edge.models.inference.MaternGP for more information
    :param keep_seed_in_data: whether to keep the seed data in the GP
        dataset. Should be True, otherwise GPyTorch fails.
    """
    measure = MaternSafety(env, gamma_optimistic, x_seed, y_seed, gp_params)
    super(SafetyLearner, self).__init__(env, measure)
    self.safety_model = measure

    # Thresholds and options are stored untouched.
    self.gamma_cautious = gamma_cautious
    self.lambda_cautious = lambda_cautious
    self.keep_seed_in_data = keep_seed_in_data

    # Both policies are defined over the environment's state-action space.
    self.active_sampling_policy = SafetyActiveSampling(
        self.env.stateaction_space
    )
    self.safety_maximization_policy = SafetyMaximization(
        self.env.stateaction_space
    )
def __init__(self, env, s_gp_params, gamma_cautious, lambda_cautious,
             gamma_optimistic, checks_safety=True, learn_safety=True,
             is_free_from_safety=False, always_update_safety=False,
             safety_model=None, *models):
    """
    Initializer.

    :param env: the environment
    :param s_gp_params: dict of parameters for the safety GP. Only read
        when `safety_model` is None, in which case it must contain the keys
        'train_x' and 'train_y' (the GP seed data); the remaining entries
        are passed to MaternSafety as `gp_params`. The dict is not modified.
    :param gamma_cautious: pair (start, end) for the cautious gamma threshold
    :param lambda_cautious: pair (start, end) for the cautious lambda
        threshold
    :param gamma_optimistic: pair (start, end) for the optimistic gamma
        threshold; the start value is used as `gamma_measure` when the
        safety model is built here
    :param checks_safety: stored as `self.checks_safety`
    :param learn_safety: stored as `self.learn_safety`
    :param is_free_from_safety: stored as `self.is_free_from_safety`
    :param always_update_safety: stored as `self.always_update_safety`
    :param safety_model: an already-built safety model; when given, no
        MaternSafety is created and `s_gp_params` is ignored
    :param models: additional models forwarded to the parent initializer
    """
    self.gamma_cautious_s, self.gamma_cautious_e = gamma_cautious
    self.lambda_cautious_s, self.lambda_cautious_e = lambda_cautious
    self.gamma_optimistic_s, self.gamma_optimistic_e = gamma_optimistic
    # The current thresholds start at the beginning of their schedules.
    self.gamma_cautious = self.gamma_cautious_s
    self.lambda_cautious = self.lambda_cautious_s

    if safety_model is not None:
        self.safety_model = safety_model
    else:
        # Pop the seed data from a shallow copy: popping from the caller's
        # dict would make the same parameters unusable for a second agent.
        gp_params = dict(s_gp_params)
        x_seed = gp_params.pop('train_x')
        y_seed = gp_params.pop('train_y')
        self.safety_model = MaternSafety(
            env,
            gamma_measure=self.gamma_optimistic_s,
            x_seed=x_seed,
            y_seed=y_seed,
            gp_params=gp_params
        )
    super().__init__(env, self.safety_model, *models)

    self.safety_learning_policy = SafetyInformationMaximization(
        env.stateaction_space
    )
    self.safe_projection_policy = SafeProjectionPolicy(
        env.stateaction_space
    )
    self.safety_maximization_policy = SafetyMaximization(
        self.env.stateaction_space
    )
    self.active_sampling_policy = SafetyActiveSampling(
        self.env.stateaction_space
    )

    # Per-step bookkeeping, filled in while the agent runs.
    self.last_controller_action = None
    self.safety_update = None
    self.checks_safety = checks_safety
    self.followed_controller = None
    self.always_update_safety = always_update_safety
    self.violated_constraint = None
    self.is_free_from_safety = is_free_from_safety
    self.learn_safety = learn_safety
def load(env, mpath, gamma_cautious, lambda_cautious, **safety_options):
    """
    Build a RandomSafetyLearner around a safety model loaded from disk.

    :param env: the environment
    :param mpath: the path the MaternSafety model is loaded from
    :param gamma_cautious: forwarded to RandomSafetyLearner
    :param lambda_cautious: forwarded to RandomSafetyLearner
    :param safety_options: extra keyword arguments forwarded to
        RandomSafetyLearner; any 'safety_model' entry is replaced by the
        loaded model
    :return: the agent using the loaded safety model
    """
    safety_model = MaternSafety.load(
        mpath, env, gamma_measure=None, x_seed=None, y_seed=None
    )
    safety_options["safety_model"] = safety_model
    # The optimistic schedule is constant: start and end are both the
    # gamma_measure of the loaded model.
    gamma_optimistic = (safety_model.gamma_measure,
                        safety_model.gamma_measure)
    # The options must be unpacked as keywords. Passed positionally, the
    # whole dict would land in the first optional parameter and the loaded
    # model would never reach the learner.
    # NOTE(review): assumes RandomSafetyLearner accepts a `safety_model`
    # keyword, as the sibling learner initializer does - confirm.
    agent = RandomSafetyLearner(env, {}, gamma_cautious, lambda_cautious,
                                gamma_optimistic, **safety_options)
    return agent
def __init__(self, env, greed, step_size, discount_rate, q_x_seed, q_y_seed,
             gamma_optimistic, gamma_hard, lambda_hard, gamma_soft,
             s_x_seed, s_y_seed, q_gp_params=None, s_gp_params=None,
             keep_seed_in_data=True):
    """
    Create the Q-Values model, the safety model and the policies.

    :param env: the environment
    :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy
        policy
    :param step_size: the step size in the Q-Learning update
    :param discount_rate: the discount rate
    :param q_x_seed: the seed input of the GP for the Q-Values model
    :param q_y_seed: the seed output of the GP for the Q-Values model
    :param gamma_optimistic: the gamma parameter for Q_optimistic
    :param gamma_hard: the gamma parameter for Q_hard, the set where
        Q-Learning is constrained (~ Q_cautious)
    :param lambda_hard: the lambda parameter for Q_hard AND Q_soft
    :param gamma_soft: the gamma parameter for Q_soft, the set outside of
        which the safety measure is updated
    :param s_x_seed: the seed input of the GP for the safety model
    :param s_y_seed: the seed output of the GP for the safety model
    :param q_gp_params: the parameters defining the GP for the Q-Values
        model. See edge.models.inference.MaternGP for more information
    :param s_gp_params: the parameters defining the GP for the safety
        model. See edge.models.inference.MaternGP for more information
    :param keep_seed_in_data: whether to keep the seed data in the GPs
        datasets. Should be True, otherwise GPyTorch fails.
    """
    value_model = GPQLearning(
        env.stateaction_space,
        step_size,
        discount_rate,
        x_seed=q_x_seed,
        y_seed=q_y_seed,
        gp_params=q_gp_params,
    )
    measure_model = MaternSafety(
        env.stateaction_space,
        gamma_optimistic,
        x_seed=s_x_seed,
        y_seed=s_y_seed,
        gp_params=s_gp_params,
    )
    super(SoftHardLearner, self).__init__(env, value_model, measure_model)

    self.Q_model = value_model
    self.safety_model = measure_model

    # Thresholds of the hard and soft sets.
    self.lambda_hard = lambda_hard
    self.gamma_hard = gamma_hard
    self.gamma_soft = gamma_soft
    self._gamma_optimistic = gamma_optimistic

    # Policies, all defined over the environment's state-action space.
    self.constrained_value_policy = ConstrainedEpsilonGreedy(
        self.env.stateaction_space, greed
    )
    self.safety_maximization_policy = SafetyMaximization(
        self.env.stateaction_space
    )
    self.active_sampling_policy = SafetyActiveSampling(
        self.env.stateaction_space
    )

    self.keep_seed_in_data = keep_seed_in_data
    if not keep_seed_in_data:
        # Only the Q-Values model's data is emptied here.
        self.Q_model.empty_data()

    # Per-step flags, unknown until the agent has taken a step.
    self.violated_soft_constraint = None
    self.updated_safety = None
def load_models(self, skip_local=False):
    """
    Load every model listed by `get_models_to_save` and attach it to the
    agent under the same name.

    :param skip_local: when False (default), models are read from
        `self.local_models_path`; when True, from `self.models_path`
    :raises KeyError: if a model name has no registered loader
    """
    from edge.model.safety_models import MaternSafety
    from edge.model.value_models import GPQLearning

    models_names = list(self.get_models_to_save().keys())
    # The models are restored with the `load` alternate constructors: the
    # plain constructors do not take a path as their first argument.
    # NOTE(review): GPQLearning.load is assumed to take the same
    # (path, env, x_seed, y_seed) arguments as MaternSafety.load takes
    # (path, env, gamma_measure, x_seed, y_seed) - confirm.
    loaders = {
        'Q_model': lambda mpath: GPQLearning.load(
            mpath, self.env, self.q_x_seed, self.q_y_seed
        ),
        'safety_model': lambda mpath: MaternSafety.load(
            mpath, self.env, self.gamma_optimistic,
            self.s_x_seed, self.s_y_seed
        ),
    }
    # The base folder does not depend on the model: pick it once.
    base_path = self.models_path if skip_local else self.local_models_path
    for mname in models_names:
        setattr(self.agent, mname, loaders[mname](base_path / mname))
def test_save_load(self):
    """
    Save a MaternSafety model and its samples, load them into a model
    built from blank seed data, and check that the training inputs and the
    GP structure match the original.
    """
    import os
    import tempfile

    env = Hovership()
    x_seed = np.array([1.45, 0.6])
    y_seed = np.array([0.8])
    x_blank = np.array([0., 0])
    y_blank = np.array([0.])
    hyperparameters = {
        'outputscale_prior': (0.4, 2),
        'lengthscale_prior': (0.2, 0.2),
        'noise_prior': (0.001, 0.002)
    }
    safety = MaternSafety(env, 0.7, x_seed, y_seed, hyperparameters)

    # Use a throw-away directory instead of a fixed 'results/' folder: the
    # test no longer depends on the working directory and leaves nothing
    # behind. The context manager keeps the directory alive until the end
    # of the block.
    with tempfile.TemporaryDirectory() as tmp_root:
        # Joining with '' keeps the trailing separator of the previous
        # 'results/' path.
        tmpdir = os.path.join(tmp_root, '')
        samples_path = os.path.join(tmp_root, 'samples.npz')

        safety.save(tmpdir)
        safety.save_samples(samples_path)

        blank = MaternSafety.load(tmpdir, env, 0.7, x_blank, y_blank)
        blank.load_samples(samples_path)

        self.assertTrue((blank.gp.train_x == safety.gp.train_x).all())
        self.assertEqual(blank.gp.structure_dict, safety.gp.structure_dict)
def __init__(self, env, greed, step_size, discount_rate, q_x_seed, q_y_seed,
             gamma_optimistic, gamma_cautious, lambda_cautious,
             s_x_seed, s_y_seed, q_gp_params=None, s_gp_params=None,
             keep_seed_in_data=True):
    """
    Create the Q-Values model, the safety model and the policies.

    :param env: the environment
    :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy
        policy
    :param step_size: the step size in the Q-Learning update
    :param discount_rate: the discount rate
    :param q_x_seed: the seed input of the GP for the Q-Values model
    :param q_y_seed: the seed output of the GP for the Q-Values model
    :param gamma_optimistic: the gamma parameter for Q_optimistic, given as
        a pair (start, end)
    :param gamma_cautious: the gamma parameter for Q_cautious, given as a
        pair (start, end)
    :param lambda_cautious: the lambda parameter for Q_cautious, given as a
        pair (start, end)
    :param s_x_seed: the seed input of the GP for the safety model
    :param s_y_seed: the seed output of the GP for the safety model
    :param q_gp_params: the parameters defining the GP for the Q-Values
        model. See edge.models.inference.MaternGP for more information
    :param s_gp_params: the parameters defining the GP for the safety
        model. See edge.models.inference.MaternGP for more information
    :param keep_seed_in_data: whether to keep the seed data in the GPs
        datasets. Should be True, otherwise GPyTorch fails.
    """
    # Each schedule is a (start, end) pair.
    lambda_start, lambda_end = lambda_cautious
    self.lambda_cautious_start = lambda_start
    self.lambda_cautious_end = lambda_end
    gamma_c_start, gamma_c_end = gamma_cautious
    self.gamma_cautious_start = gamma_c_start
    self.gamma_cautious_end = gamma_c_end
    gamma_o_start, gamma_o_end = gamma_optimistic
    self.gamma_optimistic_start = gamma_o_start
    self.gamma_optimistic_end = gamma_o_end

    # The current cautious thresholds begin at the start of their schedule.
    self.lambda_cautious = self.lambda_cautious_start
    self.gamma_cautious = self.gamma_cautious_start
    self._step_size_decrease_index = 1

    value_model = GPQLearning(
        env.stateaction_space,
        step_size,
        discount_rate,
        x_seed=q_x_seed,
        y_seed=q_y_seed,
        gp_params=q_gp_params,
    )
    measure_model = MaternSafety(
        env.stateaction_space,
        self.gamma_optimistic_start,
        x_seed=s_x_seed,
        y_seed=s_y_seed,
        gp_params=s_gp_params,
    )

    super(ValuesAndSafetyCombinator, self).__init__(
        env=env,
        greed=greed,  # Unused: we define another policy
        step_size=step_size,
        discount_rate=discount_rate,
        x_seed=q_x_seed,
        y_seed=q_y_seed,
        gp_params=q_gp_params,
        keep_seed_in_data=keep_seed_in_data,
    )

    # Assigned after the parent initializer, so the models built above are
    # the ones in use.
    self.Q_model = value_model
    self.safety_model = measure_model

    self.constrained_value_policy = ConstrainedEpsilonGreedy(
        self.env.stateaction_space, greed
    )
    self.safety_maximization_policy = SafetyMaximization(
        self.env.stateaction_space
    )

    self._training_greed = self.greed
    self.keep_seed_in_data = keep_seed_in_data
    if not keep_seed_in_data:
        # Only the Q-Values model's data is emptied here.
        self.Q_model.empty_data()
class ControlledSafetyLearner(Agent):
    """
    Agent that follows an external controller and overrides its action
    when the safety model does not consider that action cautious.

    Subclasses must implement `get_controller_action`. The cautious and
    optimistic thresholds are given as (start, end) pairs and are moved
    along their schedule by `update_safety_params`.
    """

    def __init__(self, env, s_gp_params, gamma_cautious, lambda_cautious,
                 gamma_optimistic, checks_safety=True, learn_safety=True,
                 is_free_from_safety=False, always_update_safety=False,
                 safety_model=None, *models):
        """
        Initializer.

        :param env: the environment
        :param s_gp_params: dict of parameters for the safety GP. Only read
            when `safety_model` is None, in which case it must contain the
            keys 'train_x' and 'train_y' (the GP seed data); the remaining
            entries are passed to MaternSafety as `gp_params`. The dict is
            not modified.
        :param gamma_cautious: pair (start, end) for the cautious gamma
            threshold
        :param lambda_cautious: pair (start, end) for the cautious lambda
            threshold
        :param gamma_optimistic: pair (start, end) for the optimistic gamma
            threshold; the start value is used as `gamma_measure` when the
            safety model is built here
        :param checks_safety: whether `get_next_action` checks the
            controller's action against the cautious level set
        :param learn_safety: whether the safety model is updated in `step`
        :param is_free_from_safety: when True, a non-cautious controller
            action is still taken and only flagged as a violation
        :param always_update_safety: stored as an attribute; the condition
            using it in `do_safety_update` is currently disabled
        :param safety_model: an already-built safety model; when given, no
            MaternSafety is created and `s_gp_params` is ignored
        :param models: additional models forwarded to the parent
            initializer
        """
        self.gamma_cautious_s, self.gamma_cautious_e = gamma_cautious
        self.lambda_cautious_s, self.lambda_cautious_e = lambda_cautious
        self.gamma_optimistic_s, self.gamma_optimistic_e = gamma_optimistic
        # The current thresholds start at the beginning of their schedules.
        self.gamma_cautious = self.gamma_cautious_s
        self.lambda_cautious = self.lambda_cautious_s

        if safety_model is not None:
            self.safety_model = safety_model
        else:
            # Pop the seed data from a shallow copy: popping from the
            # caller's dict would make the same parameters unusable for a
            # second agent.
            gp_params = dict(s_gp_params)
            x_seed = gp_params.pop('train_x')
            y_seed = gp_params.pop('train_y')
            self.safety_model = MaternSafety(
                env,
                gamma_measure=self.gamma_optimistic_s,
                x_seed=x_seed,
                y_seed=y_seed,
                gp_params=gp_params
            )
        super().__init__(env, self.safety_model, *models)

        self.safety_learning_policy = SafetyInformationMaximization(
            env.stateaction_space
        )
        self.safe_projection_policy = SafeProjectionPolicy(
            env.stateaction_space
        )
        self.safety_maximization_policy = SafetyMaximization(
            self.env.stateaction_space
        )
        self.active_sampling_policy = SafetyActiveSampling(
            self.env.stateaction_space
        )

        # Per-step bookkeeping, filled in by get_next_action and step.
        self.last_controller_action = None
        self.safety_update = None
        self.checks_safety = checks_safety
        self.followed_controller = None
        self.always_update_safety = always_update_safety
        self.violated_constraint = None
        self.is_free_from_safety = is_free_from_safety
        self.learn_safety = learn_safety

    def get_controller_action(self, *args, **kwargs):
        """
        Return the action proposed by the controller. Must be implemented
        by subclasses.
        """
        raise NotImplementedError

    @property
    def gamma_optimistic(self):
        """The optimistic gamma, stored as the safety model's
        `gamma_measure`."""
        return self.safety_model.gamma_measure

    @gamma_optimistic.setter
    def gamma_optimistic(self, new_gamma_optimistic):
        self.safety_model.gamma_measure = new_gamma_optimistic

    @property
    def do_safety_update(self):
        """
        Whether `step` should update the safety model. With the finer
        conditions below disabled, this only depends on `learn_safety`.
        """
        return self.learn_safety and (
            True
            # self.always_update_safety
            # or self.violated_constraint
            # or (not self.followed_controller)
            # or self.failed
        )

    def update_safety_params(self, t):
        """
        Move the three thresholds along their (start, end) schedules.

        :param t: the interpolation parameter passed to
            `affine_interpolation`
        """
        self.gamma_cautious = affine_interpolation(
            t, self.gamma_cautious_s, self.gamma_cautious_e
        )
        self.lambda_cautious = affine_interpolation(
            t, self.lambda_cautious_s, self.lambda_cautious_e
        )
        self.gamma_optimistic = affine_interpolation(
            t, self.gamma_optimistic_s, self.gamma_optimistic_e
        )

    def __get_projection_with_thresholds(self, lambda_t, gamma_t,
                                         original_action):
        """
        Project `original_action` with the safe projection policy, using
        the level set of the current state at the given thresholds as
        constraints.

        :param lambda_t: the lambda threshold of the level set
        :param gamma_t: the gamma threshold of the level set
        :param original_action: the action to project
        :return: the action returned by the projection policy
        """
        constraints = self.safety_model.level_set(
            self.state,
            lambda_threshold=lambda_t,
            gamma_threshold=gamma_t
        )
        projected_action = self.safe_projection_policy.get_action(
            to_project=original_action,
            constraints=constraints
        )
        return projected_action

    def __get_alternative_with_thresholds(self, lambda_t, gamma_t,
                                          maximize_safety_proba=False,
                                          use_covar_slice=False):
        """
        Pick an alternative action from the level set of the current state.

        :param lambda_t: the lambda threshold of the level set
        :param gamma_t: the gamma threshold of the level set
        :param maximize_safety_proba: when True, ignore the level set and
            return the action chosen by the safety maximization policy from
            the safety probabilities
        :param use_covar_slice: when True, choose with the active sampling
            policy on the covariance slice; otherwise choose with the
            safety learning policy on the covariance-matrix row of the last
            controller action
        :return: the alternative action, or None when the level set is
            empty and `maximize_safety_proba` is False
        """
        alt_set, safety_proba, covar_slice, covar_matrix = \
            self.safety_model.level_set(
                self.state,
                lambda_threshold=lambda_t,
                gamma_threshold=gamma_t,
                return_proba=True,
                return_covar=True,
                return_covar_matrix=True,
            )
        if not maximize_safety_proba:
            alt_set = alt_set.squeeze()
            if alt_set.any():
                ctrlr_idx = self.env.action_space.get_index_of(
                    self.last_controller_action, around_ok=True
                )
                if use_covar_slice:
                    alternative = self.active_sampling_policy(
                        covar_slice.squeeze(), alt_set
                    )
                else:
                    alternative = self.safety_learning_policy.get_action(
                        covar_matrix[ctrlr_idx, :].squeeze(), alt_set
                    )
                return alternative
            else:
                return None
        else:
            safety_proba = safety_proba.squeeze()
            return self.safety_maximization_policy.get_action(safety_proba)

    def get_next_action(self):
        """
        Choose the next action, starting from the controller's proposal.

        When `checks_safety` is set and the proposal is not in the cautious
        level set, the fallbacks are tried in this order: projection on the
        cautious level set, projection on the optimistic level set (lambda
        threshold 0), then maximization of the safety probability. When
        `is_free_from_safety` is set, the proposal is kept and only flagged
        through `violated_constraint`.

        Updates `last_controller_action`, `followed_controller` and
        `violated_constraint`.

        :return: the action to take
        """
        self.followed_controller = True
        self.violated_constraint = False
        self.last_controller_action = self.get_controller_action()
        action = self.last_controller_action
        if self.checks_safety:
            controller_is_cautious = self.safety_model.is_in_level_set(
                self.state, action, self.lambda_cautious, self.gamma_cautious
            )
            if not controller_is_cautious:
                if self.is_free_from_safety:
                    self.violated_constraint = True
                else:
                    # alternative = self.__get_alternative_with_thresholds(
                    #     self.lambda_cautious, self.gamma_cautious,
                    #     use_covar_slice=False
                    # )
                    alternative = self.__get_projection_with_thresholds(
                        self.lambda_cautious, self.gamma_cautious, action
                    )
                    if alternative is not None:
                        # We found a cautious alternative
                        self.violated_constraint = False
                        self.followed_controller = False
                        action = alternative
                    else:
                        self.violated_constraint = True
                        self.followed_controller = False
                        # alternative = self.__get_alternative_with_thresholds(
                        #     0., self.gamma_optimistic
                        # )
                        alternative = self.__get_projection_with_thresholds(
                            0., self.gamma_optimistic, action
                        )
                        if alternative is not None:
                            # We found an optimistic alternative
                            action = alternative
                        else:
                            # No cautious or optimistic action available:
                            # maximize safety probability
                            action = self.__get_alternative_with_thresholds(
                                0., self.gamma_optimistic,
                                maximize_safety_proba=True
                            )
        return action

    def update_models(self, state, action, next_state, reward, failed, done):
        """
        Update the safety model with one transition.

        :return: whatever the safety model's `update` returns
        """
        return self.safety_model.update(state, action, next_state, reward,
                                        failed, done)

    def step(self):
        """
        Chooses an action according to the policy, takes a step in the
        Environment, and updates the models. The action taken is available
        in self.last_action, and the result of the safety update (or None
        when no update was made) in self.safety_update.

        :return: new_state, reward, failed, done
        """
        old_state = self.state
        self.last_action = self.get_next_action()
        self.state, reward, failed = self.env.step(self.last_action)
        done = self.env.done
        if self.training_mode and self.do_safety_update:
            self.safety_update = self.update_models(
                old_state, self.last_action, self.state, reward, failed, done
            )
        else:
            self.safety_update = None
        return self.state, reward, failed, done
def load_model(self):
    """
    Replace the agent's safety model with the one stored at
    `self.model_path`, keeping the gamma_measure of the model currently
    attached to the agent.
    """
    model_folder = str(self.model_path)
    current_gamma = self.agent.safety_model.gamma_measure
    loaded_model = MaternSafety.load(
        model_folder, self.env, current_gamma, self.x_seed, self.y_seed
    )
    self.agent.safety_model = loaded_model