    def test_indexing(self):
        hovership_params = {'shape': (100, 5)}
        env = Hovership(random_start=True,
                        dynamics_parameters=hovership_params)
        x_seed = np.array([1., 1.])
        y_seed = np.array([1])
        hyperparameters = {
            'outputscale_prior': (1, 0.1),
            'lengthscale_prior': (0.2, 0.05),
            'noise_prior': (0.001, 0.001)
        }
        gpqlearning = GPQLearning(env,
                                  0.9,
                                  0.9,
                                  x_seed=x_seed,
                                  y_seed=y_seed,
                                  gp_params=hyperparameters)

        query = gpqlearning._get_query_from_index(
            (np.array([0.5]), slice(None, None, None)))
        self.assertEqual(query.shape, (5, 2))
        self.assertTrue(np.all(query[:, 0] == 0.5))

        pred = gpqlearning.gp.predict(query).mean.cpu().numpy()
        self.assertEqual(pred.shape, (5, ))

    def test_policy_convergence(self):
        hovership_params = {'shape': (100, 2)}
        env = Hovership(random_start=True,
                        dynamics_parameters=hovership_params)
        hyperparameters = {
            'outputscale_prior': (1, 0.1),
            'lengthscale_prior': (0.2, 0.05),
            'noise_prior': (0.001, 0.001)
        }
        x_seed = np.array([0.85, 1.])
        y_seed = np.array([1.])
        gpqlearning = GPQLearning(env,
                                  0.9,
                                  0.9,
                                  x_seed=x_seed,
                                  y_seed=y_seed,
                                  gp_params=hyperparameters)
        nA = env.action_space.index_shape[0]
        eps = 0.1
        for episode in range(3):
            state = env.reset()
            failed = env.has_failed
            n_steps = 0
            while not failed and n_steps < 50:
                probas = np.ones(nA) * eps / nA
                probas[np.argmax(gpqlearning[state, :])] += 1 - eps
                action = env.action_space[np.random.choice(nA, p=probas)]
                new_state, reward, failed = env.step(action)
                print(f'Step {n_steps} - State {state} - New state {new_state}'
                      f' - Action {action} - Reward {reward} - Failed '
                      f'{failed}')
                gpqlearning.update(state, action, new_state, reward, failed)
                state = new_state
                n_steps += 1

        def policy_from_gpq(gpq):
            q_values = gpq[:, :].reshape(gpq.env.stateaction_space.index_shape)
            policy = np.zeros_like(q_values)
            for i, _ in iter(env.state_space):
                policy[i, np.argmax(q_values[i, :])] = 1
            return policy

        policy = policy_from_gpq(gpqlearning)
        print("The computation of the policy works, but "
              "the convergence value is not tested. "
              f"Policy:\n{policy}")
        self.assertTrue(True)
Example #3
    def load_models(self, skip_local=False):
        model_name = list(self.get_models_to_save().keys())[0]
        if not skip_local:
            load_path = self.local_models_path / model_name
        else:
            load_path = self.models_path / model_name
        self.agent.value_model = GPQLearning.load(load_path)
Example #4
    def load_models(self, skip_local=False):
        model_name = list(self.get_models_to_save().keys())[0]
        if not skip_local:
            load_path = self.local_models_path / model_name
        else:
            load_path = self.models_path / model_name
        self.agent.value_model = GPQLearning.load(load_path,
                                                  self.env.stateaction_space,
                                                  self.x_seed, self.y_seed)
Example #5
    def __init__(self, env,
                 greed, step_size, discount_rate, q_x_seed, q_y_seed,
                 gamma_optimistic, gamma_hard, lambda_hard, gamma_soft, s_x_seed, s_y_seed,
                 q_gp_params=None, s_gp_params=None, keep_seed_in_data=True):
        """
        Initializer
        :param env: the environment
        :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy policy
        :param step_size: the step size in the Q-Learning update
        :param discount_rate: the discount rate
        :param q_x_seed: the seed input of the GP for the Q-Values model
        :param q_y_seed: the seed output of the GP for the Q-Values model
        :param gamma_optimistic: the gamma parameter for Q_optimistic
        :param gamma_hard: the gamma parameter for Q_hard, the set where Q-Learning is constrained (~ Q_cautious)
        :param lambda_hard: the lambda parameter for Q_hard AND Q_soft
        :param gamma_soft: the gamma parameter for Q_soft, the set outside of which the safety measure is updated
        :param s_x_seed: the seed input of the GP for the safety model
        :param s_y_seed: the seed output of the GP for the safety model
        :param q_gp_params: the parameters defining the GP for the Q-Values model. See edge.models.inference.MaternGP
            for more information
        :param s_gp_params: the parameters defining the GP for the safety model. See edge.models.inference.MaternGP
            for more information
        :param keep_seed_in_data: whether to keep the seed data in the GPs datasets. Should be True, otherwise GPyTorch
            fails.
        """
        Q_model = GPQLearning(env.stateaction_space, step_size, discount_rate,
                              x_seed=q_x_seed, y_seed=q_y_seed,
                              gp_params=q_gp_params)
        safety_model = MaternSafety(env.stateaction_space, gamma_optimistic,
                                    x_seed=s_x_seed, y_seed=s_y_seed,
                                    gp_params=s_gp_params)
        super(SoftHardLearner, self).__init__(env, Q_model, safety_model)

        self.Q_model = Q_model
        self.safety_model = safety_model
        self.lambda_hard = lambda_hard
        self.gamma_hard = gamma_hard
        self.gamma_soft = gamma_soft
        self._gamma_optimistic = gamma_optimistic

        self.constrained_value_policy = ConstrainedEpsilonGreedy(
            self.env.stateaction_space, greed)
        self.safety_maximization_policy = SafetyMaximization(
            self.env.stateaction_space)
        self.active_sampling_policy = SafetyActiveSampling(
            self.env.stateaction_space)

        self.keep_seed_in_data = keep_seed_in_data
        if not keep_seed_in_data:
            self.Q_model.empty_data()

        self.violated_soft_constraint = None
        self.updated_safety = None
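
A hedged usage sketch (not part of the project sources): constructing a SoftHardLearner on the Hovership environment from the tests above, assuming Hovership, SoftHardLearner and numpy (as np) are imported as in the project. The hyperparameter dictionary and all numeric values are illustrative.

# Hypothetical construction sketch; hyperparameters and values are illustrative.
env = Hovership(random_start=True, dynamics_parameters={'shape': (100, 5)})
gp_params = {'outputscale_prior': (1, 0.1),
             'lengthscale_prior': (0.2, 0.05),
             'noise_prior': (0.001, 0.001)}
agent = SoftHardLearner(
    env,
    greed=0.1,  # epsilon of the constrained epsilon-greedy policy
    step_size=0.6,
    discount_rate=0.9,
    q_x_seed=np.array([1., 1.]), q_y_seed=np.array([1.]),
    gamma_optimistic=0.6, gamma_hard=0.9, lambda_hard=0.05, gamma_soft=0.95,
    s_x_seed=np.array([1., 1.]), s_y_seed=np.array([1.]),
    q_gp_params=gp_params, s_gp_params=gp_params)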
Example #6
File: q_learner.py (project: sheim/edge)
    def __init__(self,
                 env,
                 safety_measure,
                 greed,
                 step_size,
                 discount_rate,
                 safety_threshold,
                 x_seed,
                 y_seed,
                 gp_params=None,
                 keep_seed_in_data=True):
        """
        Initializer
        :param env: the environment
        :param safety_measure: either SafetyTruth or SafetyModel of the environment
        :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy policy
        :param step_size: the step size in the Q-Learning update
        :param discount_rate: the discount rate
        :param safety_threshold: the lambda threshold used to evaluate safety. This is 0 theoretically, but an Agent
            that is at the exact boundary of the viability kernel still fails due to rounding errors. Hence, this should
            be a small, positive value.
        :param x_seed: the seed input of the GP
        :param y_seed: the seed output of the GP
        :param gp_params: the parameters defining the GP. See edge.models.inference.MaternGP for more information
        :param keep_seed_in_data: whether to keep the seed data in the GP dataset. Should be True, otherwise GPyTorch
            fails.
        """
        Q_model = GPQLearning(env.stateaction_space,
                              step_size,
                              discount_rate,
                              x_seed=x_seed,
                              y_seed=y_seed,
                              gp_params=gp_params)
        super(ConstrainedQLearner, self).__init__(env, Q_model)

        self.Q_model = Q_model
        self.safety_measure = safety_measure
        self.constrained_value_policy = ConstrainedEpsilonGreedy(
            self.env.stateaction_space, greed)
        self.safety_maximization_policy = SafetyMaximization(
            self.safety_measure.stateaction_space)
        self.safety_threshold = safety_threshold
        self.keep_seed_in_data = keep_seed_in_data
        if not keep_seed_in_data:
            self.Q_model.empty_data()
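
A hedged usage sketch (not from the repository): the safety_measure argument is assumed to be a SafetyTruth or SafetyModel already built for the environment; safety_truth below is a hypothetical name, and all numeric values are illustrative.

# Hypothetical sketch; `safety_truth` stands in for a precomputed SafetyTruth
# (or SafetyModel) for `env`, and the numbers are illustrative.
agent = ConstrainedQLearner(
    env,
    safety_measure=safety_truth,
    greed=0.1,
    step_size=0.6,
    discount_rate=0.9,
    safety_threshold=0.05,  # small positive value, as recommended above
    x_seed=np.array([1., 1.]),
    y_seed=np.array([1.]),
    gp_params={'outputscale_prior': (1, 0.1),
               'lengthscale_prior': (0.2, 0.05),
               'noise_prior': (0.001, 0.001)})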
Example #7
    def load_models(self, skip_local=False):
        from edge.model.safety_models import MaternSafety
        from edge.model.value_models import GPQLearning
        models_names = list(self.get_models_to_save().keys())
        loaders = {
            'Q_model':
                lambda mpath: GPQLearning(mpath, self.env,
                                          self.q_x_seed, self.q_y_seed),
            'safety_model':
                lambda mpath: MaternSafety(mpath, self.env,
                                           self.gamma_optimistic,
                                           self.s_x_seed, self.s_y_seed),
        }
        for mname in models_names:
            if not skip_local:
                load_path = self.local_models_path / mname
            else:
                load_path = self.models_path / mname
            setattr(self.agent, mname, loaders[mname](load_path))
Example #8
File: q_learner.py (project: sheim/edge)
    def __init__(self,
                 env,
                 greed,
                 step_size,
                 discount_rate,
                 x_seed,
                 y_seed,
                 gp_params=None,
                 keep_seed_in_data=True):
        """
        Initializer
        :param env: the environment
        :param greed: the epsilon parameter of the EpsilonGreedy policy
        :param step_size: the step size in the Q-Learning update
        :param discount_rate: the discount rate
        :param x_seed: the seed input of the GP
        :param y_seed: the seed output of the GP
        :param gp_params: the parameters defining the GP. See edge.models.inference.MaternGP for more information
        :param keep_seed_in_data: whether to keep the seed data in the GP dataset. Should be True, otherwise GPyTorch
            fails.
        """
        Q_model = GPQLearning(env.stateaction_space,
                              step_size,
                              discount_rate,
                              x_seed=x_seed,
                              y_seed=y_seed,
                              gp_params=gp_params)
        super(QLearner, self).__init__(env, Q_model)

        self.Q_model = Q_model
        self.policy = EpsilonGreedy(env, greed)
        self.keep_seed_in_data = keep_seed_in_data
        if not keep_seed_in_data:
            self.Q_model.empty_data()

        self._step_size_decrease_index = 1
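
A hedged usage sketch of the unconstrained QLearner, reusing the Hovership setup and GP hyperparameter format from the tests above; the values are illustrative, not taken from the project.

# Hypothetical sketch; environment and hyperparameters mirror the tests above.
env = Hovership(random_start=True, dynamics_parameters={'shape': (100, 5)})
agent = QLearner(env,
                 greed=0.1,
                 step_size=0.6,
                 discount_rate=0.9,
                 x_seed=np.array([1., 1.]),
                 y_seed=np.array([1.]),
                 gp_params={'outputscale_prior': (1, 0.1),
                            'lengthscale_prior': (0.2, 0.05),
                            'noise_prior': (0.001, 0.001)})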
Example #9
    def __init__(self,
                 env,
                 greed,
                 step_size,
                 discount_rate,
                 q_x_seed,
                 q_y_seed,
                 gamma_optimistic,
                 gamma_cautious,
                 lambda_cautious,
                 s_x_seed,
                 s_y_seed,
                 q_gp_params=None,
                 s_gp_params=None,
                 keep_seed_in_data=True):
        """
        Initializer
        :param env: the environment
        :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy
            policy
        :param step_size: the step size in the Q-Learning update
        :param discount_rate: the discount rate
        :param q_x_seed: the seed input of the GP for the Q-Values model
        :param q_y_seed: the seed output of the GP for the Q-Values model
        :param gamma_optimistic: the (start, end) values of the gamma parameter
            for Q_optimistic
        :param gamma_cautious: the (start, end) values of the gamma parameter
            for Q_cautious
        :param lambda_cautious: the (start, end) values of the lambda parameter
            for Q_cautious
        :param s_x_seed: the seed input of the GP for the safety model
        :param s_y_seed: the seed output of the GP for the safety model
        :param q_gp_params: the parameters defining the GP for the Q-Values
            model. See edge.models.inference.MaternGP
            for more information
        :param s_gp_params: the parameters defining the GP for the safety model.
            See edge.models.inference.MaternGP for more information
        :param keep_seed_in_data: whether to keep the seed data in the GPs
            datasets. Should be True, otherwise GPyTorch fails.
        """
        self.lambda_cautious_start, self.lambda_cautious_end = lambda_cautious
        self.gamma_cautious_start, self.gamma_cautious_end = gamma_cautious
        self.gamma_optimistic_start, self.gamma_optimistic_end = \
            gamma_optimistic
        self.lambda_cautious = self.lambda_cautious_start
        self.gamma_cautious = self.gamma_cautious_start

        self._step_size_decrease_index = 1

        Q_model = GPQLearning(env.stateaction_space,
                              step_size,
                              discount_rate,
                              x_seed=q_x_seed,
                              y_seed=q_y_seed,
                              gp_params=q_gp_params)
        safety_model = MaternSafety(env.stateaction_space,
                                    self.gamma_optimistic_start,
                                    x_seed=s_x_seed,
                                    y_seed=s_y_seed,
                                    gp_params=s_gp_params)
        super(ValuesAndSafetyCombinator, self).__init__(
            env=env,
            greed=greed,  # Unused: we define another policy
            step_size=step_size,
            discount_rate=discount_rate,
            x_seed=q_x_seed,
            y_seed=q_y_seed,
            gp_params=q_gp_params,
            keep_seed_in_data=keep_seed_in_data)

        self.Q_model = Q_model
        self.safety_model = safety_model

        self.constrained_value_policy = ConstrainedEpsilonGreedy(
            self.env.stateaction_space, greed)
        self.safety_maximization_policy = SafetyMaximization(
            self.env.stateaction_space)
        self._training_greed = self.greed

        self.keep_seed_in_data = keep_seed_in_data
        if not keep_seed_in_data:
            self.Q_model.empty_data()
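
A hedged construction sketch: per the unpacking in the initializer above, gamma_optimistic, gamma_cautious and lambda_cautious are (start, end) pairs. In the project this class is presumably used through a subclass; the direct call below is only illustrative, with made-up values and the same hyperparameter format as the tests above.

# Hypothetical sketch; the (start, end) pairs match the unpacking performed
# in the initializer above, and all values are illustrative.
gp_params = {'outputscale_prior': (1, 0.1),
             'lengthscale_prior': (0.2, 0.05),
             'noise_prior': (0.001, 0.001)}
agent = ValuesAndSafetyCombinator(
    env,
    greed=0.1,
    step_size=0.6,
    discount_rate=0.9,
    q_x_seed=np.array([1., 1.]), q_y_seed=np.array([1.]),
    gamma_optimistic=(0.6, 0.9),
    gamma_cautious=(0.7, 0.95),
    lambda_cautious=(0.05, 0.0),
    s_x_seed=np.array([1., 1.]), s_y_seed=np.array([1.]),
    q_gp_params=gp_params, s_gp_params=gp_params)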