Example #1
    def __init__(self,
                 actions,
                 gamma=0.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 min_sampling_probability=0.1,
                 name="MaxQInit"):
        """
        :param actions: action space of the environment
        :param gamma: (float) discount factor
        :param r_max: (float) known upper-bound on the reward function
        :param v_max: (float) known upper-bound on the value function
        :param deduce_v_max: (bool) set to True to deduce v_max from r_max
        :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon_m are not defined)
        :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
        :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
        :param epsilon_m: (float) precision of the learned models in L1 norm
        :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
        :param n_states: (int) number of states

        :param min_sampling_probability: (float) minimum sampling probability of an environment
        :param name: (str) name of the agent
        """
        RMax.__init__(self,
                      actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=deduce_v_max,
                      n_known=n_known,
                      deduce_n_known=deduce_n_known,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      name=name)

        self.min_sampling_probability = min_sampling_probability
        self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))  # visited state-action pairs
        self.U_memory = []  # Upper-bounds on the Q-values of previous MDPs
        self.n_required_tasks = number_of_tasks_for_high_confidence_upper_bound(
            delta, min_sampling_probability)
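The last line above converts (delta, min_sampling_probability) into the number of tasks after which the transferred upper bound holds with high confidence. Below is a minimal sketch of what that helper could compute, assuming it implements the standard MaxQInit bound: the smallest m with (1 - p_min)^m <= delta, so that after m tasks the current MDP has already been sampled with probability at least 1 - delta. This is an illustration, not the repository's actual code:

import math

def number_of_tasks_for_high_confidence_upper_bound(delta, min_sampling_probability):
    # Smallest integer m with (1 - p_min)^m <= delta; both logarithms are
    # negative, so the ratio is positive and the ceiling is well defined.
    return int(math.ceil(math.log(delta) / math.log(1.0 - min_sampling_probability)))

For instance, delta = 0.1 and min_sampling_probability = 0.1 give m = ceil(ln(0.1) / ln(0.9)) = 22.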
Example #2
    def __init__(self,
                 actions,
                 gamma=0.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 name="ExpRMax",
                 path='results/'):
        RMax.__init__(self,
                      actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=deduce_v_max,
                      n_known=n_known,
                      deduce_n_known=deduce_n_known,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      name=name)

        # Recorded variables
        self.discounted_return = 0.
        self.total_return = 0.
        self.n_time_steps = 0  # number of time steps
        self.update_time_steps = []  # time steps where a model update occurred

        self.path = path
        self.instance_number = 0
        self.run_number = 0
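ExpRMax only adds result recording on top of RMax. A hypothetical helper (the name record_transition and its arguments are assumptions, not part of the excerpt) sketches how the recorded variables above would typically be maintained:

def record_transition(self, reward, model_updated=False):
    # Hypothetical bookkeeping hook: update the recorded statistics
    # after one environment transition.
    self.discounted_return += (self.gamma ** self.n_time_steps) * reward
    self.total_return += reward
    if model_updated:
        self.update_time_steps.append(self.n_time_steps)
    self.n_time_steps += 1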
Example #3
    def __init__(self,
                 actions,
                 gamma=0.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 max_memory_size=None,
                 prior=None,
                 estimate_distances_online=True,
                 min_sampling_probability=0.1,
                 name="LRMax"):
        """
        :param actions: action space of the environment
        :param gamma: (float) discount factor
        :param r_max: (float) known upper-bound on the reward function
        :param v_max: (float) known upper-bound on the value function
        :param deduce_v_max: (bool) set to True to deduce v_max from r_max
        :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon_m are not defined)
        :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
        :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
        :param epsilon_m: (float) precision of the learned models in L1 norm
        :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
        :param n_states: (int) number of states

        :param max_memory_size: (int) maximum number of saved models (infinity if None)
        :param prior: (float) prior knowledge of maximum model's distance
        :param estimate_distances_online: (bool) set to True for online estimation of a tighter upper-bound for the
        model pseudo-distances. The estimation is valid with high probability.
        :param min_sampling_probability: (float) minimum sampling probability of an environment
        :param name: (str) name of the agent
        """
        self.name = name
        RMax.__init__(self,
                      actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=deduce_v_max,
                      n_known=n_known,
                      deduce_n_known=deduce_n_known,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      name=name)

        # Lifelong Learning memories
        self.max_memory_size = max_memory_size
        self.U_memory = []  # Upper-bounds on the Q-values of previous MDPs
        self.R_memory = []  # Learned reward models of previous MDPs
        self.T_memory = []  # Learned transition models of previous MDPs
        self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))  # visited state-action pairs

        self.U_lip = []  # Lipschitz upper-bounds on the Q-values
        self.b = self.epsilon_m * (1. + self.gamma * self.v_max)  # slack term of the epsilon_m-accurate models

        # Prior knowledge on maximum model distance
        prior_max = self.r_max + self.gamma * 2. * self.v_max
        self.prior = prior_max if prior is None else min(prior, prior_max)
        self.prior = round(self.prior, 2)

        # Online distances estimation
        self.estimate_distances_online = estimate_distances_online
        self.min_sampling_probability = min_sampling_probability
        # Dictionary of model distances (valid with high probability)
        self.D = defaultdict(lambda: defaultdict(lambda: prior_max))
        self.n_samples_high_confidence = compute_n_samples_high_confidence(
            min_sampling_probability, delta)

        self.update_upper_bound()
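The nested defaultdict used for self.D makes every unseen state-action pair default to prior_max, the loosest valid distance, so the online estimation can only tighten it. A standalone illustration with made-up numbers, assuming deduce_v_max sets v_max = r_max / (1 - gamma) (the usual bound for rewards in [0, r_max]):

from collections import defaultdict

r_max, gamma = 1.0, 0.9
v_max = r_max / (1.0 - gamma)            # 10.0 when deduced from r_max
prior_max = r_max + gamma * 2.0 * v_max  # 19.0, the loosest valid distance

D = defaultdict(lambda: defaultdict(lambda: prior_max))
print(D["s0"]["a0"])                     # 19.0: unseen pairs return the prior
D["s0"]["a0"] = min(D["s0"]["a0"], 4.2)  # an online estimate only tightens it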