Example #1
    def update_upper_bound(self):
        """
        Update the total upper bound on the Q-value function.
        Called at initialization and when a new state-action pair is known.
        :return: None
        """
        self.update_lipschitz_upper_bounds()
        self.initialize_upper_bound()
        RMax.update_upper_bound(self)
Example #2
    def reset(self):
        """
        Reset the attributes to initial state (called between instances).
        :return: None
        """
        self.update_memory()

        RMax.reset(self)

        if len(self.U_memory) > self.n_required_tasks:
            self.update_max_q_init_upper_bound()
Example #3
    def __init__(self,
                 actions,
                 gamma=0.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 min_sampling_probability=0.1,
                 name="MaxQInit"):
        """
        :param actions: action space of the environment
        :param gamma: (float) discount factor
        :param r_max: (float) known upper-bound on the reward function
        :param v_max: (float) known upper-bound on the value function
        :param deduce_v_max: (bool) set to True to deduce v_max from r_max
        :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon are not defined)
        :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
        :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
        :param epsilon_m: (float) precision of the learned models in L1 norm
        :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
        :param n_states: (int) number of states

        :param min_sampling_probability: (float) minimum sampling probability of an environment
        :param name: (str)
        """
        RMax.__init__(self,
                      actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=deduce_v_max,
                      n_known=n_known,
                      deduce_n_known=deduce_n_known,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      name=name)

        self.min_sampling_probability = min_sampling_probability
        self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))
        self.U_memory = []  # Upper-bounds on the Q-values of previous MDPs
        self.n_required_tasks = number_of_tasks_for_high_confidence_upper_bound(
            delta, min_sampling_probability)
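The constructor above sizes its task memory with number_of_tasks_for_high_confidence_upper_bound. A minimal sketch of such a helper, assuming the standard MaxQInit argument (the smallest m with (1 - min_sampling_probability)**m <= delta guarantees that every MDP sampled with probability at least min_sampling_probability has been seen at least once, with probability at least 1 - delta); the repository's actual helper may differ:

import math

def number_of_tasks_for_high_confidence_upper_bound(delta, min_sampling_probability):
    # Smallest m such that (1 - p_min)^m <= delta: after m tasks, any MDP drawn
    # with probability at least p_min has been sampled at least once with
    # probability at least 1 - delta.
    return int(math.ceil(math.log(delta) / math.log(1.0 - min_sampling_probability)))

With delta=0.1 and min_sampling_probability=0.1, for instance, this yields 22 required tasks.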
Example #4
    def reset(self):
        """
        Reset the attributes to initial state (called between instances).
        Save the previous model.
        :return: None
        """
        RMax.reset(self)

        self.write(init=False)

        # Reset recorded variables between MDPs
        self.discounted_return = 0.
        self.total_return = 0.
        self.n_time_steps = 0
        self.update_time_steps = []
Example #5
    def reset(self):
        """
        Reset the attributes to initial state (called between instances).
        Save the previous model.
        :return: None
        """
        # Save previously learned model
        if len(self.counter) > 0 and (self.max_memory_size is None or
                                      len(self.U_lip) < self.max_memory_size):
            self.update_memory()

        RMax.reset(self)

        if self.estimate_distances_online:
            self.update_max_distances()
        self.update_upper_bound()
Example #6
def experiment(p):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(
        env_class='tight', n_env=n_env, gamma=gamma,
        env_name=p['name'],
        w=size,
        h=size,
        stochastic=p['stochastic']
    )
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / float(n_env)
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = p['v_max']
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmax_p01 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    lrmax_p02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                        deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                            deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                            min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                                min_sampling_probability=p_min, name='LRMaxQInit(Dmax=0.1)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                                min_sampling_probability=p_min, name='LRMaxQInit(Dmax=0.2)')
    agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p02, maxqinit, lrmaxqinit, lrmaxqinit_p01, lrmaxqinit_p02]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=3, n_tasks=p['n_tasks'], n_episodes=p['n_episodes'],
                        n_steps=p['n_steps'],
                        reset_at_terminal=False, open_plot=False, plot_title=True, do_run=True, do_plot=True,
                        parallel_run=True, n_processes=None)
Example #7
def experiment():
    n_env = 5
    env_distribution = make_env_distribution(env_class='maze-mono-goal', env_name='maze-mono-goal', n_env=n_env, gamma=GAMMA)
    actions = env_distribution.get_actions()
    p_min = 1. / float(n_env)
    delta = .1

    m = 100
    max_mem = 10
    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)
    rmax_q = MaxQInit(actions=actions, gamma=GAMMA, count_threshold=m, min_sampling_probability=p_min, delta=delta)
    lrmax1 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=1.)
    lrmax05 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=0.5)
    lrmax02 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=0.2)
    lrmax_learn = LRMax(
        actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=None,
        min_sampling_probability=p_min, delta=delta
    )

    agents_pool = [rmax, lrmax1, lrmax05, lrmax02, lrmax_learn, rmax_q]

    run_agents_lifelong(
        agents_pool, env_distribution, samples=20, episodes=100, steps=1000, reset_at_terminal=False,
        open_plot=True, cumulative_plot=False, is_tracked_value_discounted=True, plot_only=False, plot_title=False
    )
Example #8
def experiment():
    n_env = 5
    env_distribution = make_env_distribution(env_class='corridor',
                                             n_env=n_env,
                                             gamma=GAMMA,
                                             w=20,
                                             h=1)
    actions = env_distribution.get_actions()
    p_min = 1. / float(n_env)
    delta = .1

    m = 1
    max_mem = 2
    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)
    rmax_q = MaxQInit(actions=actions,
                      gamma=GAMMA,
                      count_threshold=m,
                      min_sampling_probability=p_min,
                      delta=delta)
    lrmax0_2 = LRMaxCT(actions=actions,
                       gamma=GAMMA,
                       count_threshold=m,
                       max_memory_size=max_mem,
                       prior=0.2)
    lrmax0_6 = LRMaxCT(actions=actions,
                       gamma=GAMMA,
                       count_threshold=m,
                       max_memory_size=max_mem,
                       prior=0.6)
    lrmax1_0 = LRMaxCT(actions=actions,
                       gamma=GAMMA,
                       count_threshold=m,
                       max_memory_size=max_mem,
                       prior=1.0)
    lrmax_learn = LRMaxCT(actions=actions,
                          gamma=GAMMA,
                          count_threshold=m,
                          max_memory_size=max_mem,
                          prior=None,
                          min_sampling_probability=p_min,
                          delta=delta)

    agents_pool = [rmax, lrmax1_0, lrmax0_6, lrmax0_2, lrmax_learn, rmax_q]

    run_agents_lifelong(agents_pool,
                        env_distribution,
                        samples=20,
                        episodes=20,
                        steps=10,
                        reset_at_terminal=False,
                        open_plot=True,
                        cumulative_plot=False,
                        is_tracked_value_discounted=False,
                        plot_only=False,
                        plot_title=False)
Example #9
    def __init__(self,
                 actions,
                 gamma=.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 name="ExpRMax",
                 path='results/'):
        RMax.__init__(self,
                      actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=deduce_v_max,
                      n_known=n_known,
                      deduce_n_known=deduce_n_known,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      name=name)

        # Recorded variables
        self.discounted_return = 0.
        self.total_return = 0.
        self.n_time_steps = 0  # number of time steps
        self.update_time_steps = []  # time steps at which a model update occurred

        self.path = path
        self.instance_number = 0
        self.run_number = 0
Example #10
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='deterministic-super-tight',
                                             env_name='deterministic-super-tight-bignknown',
                                             gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 100
    p_min = 1. / 3.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 9

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                       deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                       max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                        deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                            deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                            min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinitprior = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                                 deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                 n_states=n_states, max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                                 min_sampling_probability=p_min, name='LRMaxQInit(Dmax=0.1)')
    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit, lrmaxqinit, lrmaxqinitprior]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=1, n_tasks=100, n_episodes=200, n_steps=100,
                        reset_at_terminal=False, open_plot=False, plot_title=True, do_run=False, do_plot=True,
                        parallel_run=True, n_processes=None)
Example #11
def main():
    # Setup MDP.
    w = 6
    h = 6
    mdp = GridWorld(width=w,
                    height=h,
                    init_loc=(1, 1),
                    goal_locs=[(6, 6)],
                    slip_prob=.1)

    # Setup Agents.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Compute number of samples for R-MAX to achieve epsilon optimal behavior with high probability (1 - delta)
    compute_n_samples = False
    if compute_n_samples:
        epsilon = .1
        delta = .05
        m_r = np.log(2. / delta) / (2. * epsilon**2)
        m_t = 2. * (np.log(2**(float(w * h)) - 2.) - np.log(delta)) / (epsilon**2)
        n_samples = int(max(m_r, m_t))
    else:
        n_samples = 30

    simple_rl_rmax_agent = RMaxAgent(actions=mdp.get_actions(),
                                     gamma=.9,
                                     horizon=3,
                                     s_a_threshold=n_samples,
                                     name='SimpleRL-R-MAX')
    rmax_agent = RMax(actions=mdp.get_actions(),
                      gamma=.9,
                      count_threshold=n_samples)

    # Run experiment and make plot.
    run_agents_on_mdp([rand_agent, ql_agent, rmax_agent, simple_rl_rmax_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=20,
                      reset_at_terminal=True,
                      verbose=False)
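The compute_n_samples branch above is disabled by default; a standalone sketch of the same arithmetic (only numpy assumed) shows why the script falls back to the hand-picked n_samples = 30:

import numpy as np

# R-MAX sample-size bounds for the 6x6 grid with epsilon = .1, delta = .05.
w, h = 6, 6
epsilon, delta = .1, .05
m_r = np.log(2. / delta) / (2. * epsilon**2)  # reward-model bound, ~184
m_t = 2. * (np.log(2**(float(w * h)) - 2.) - np.log(delta)) / (epsilon**2)  # transition-model bound, ~5590
print(int(max(m_r, m_t)))  # 5589: far too many samples to be practical here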
Example #12
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='stochastic-tight', env_name='stochastic-tight', gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 10
    p_min = 1. / 7.  # There are seven possible MDPs
    epsilon_q = .1
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 10

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                       deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                       max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                        deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                            deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                            min_sampling_probability=p_min, name='LRMaxQInit')
    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit]  # , lrmaxqinit]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=5, n_tasks=50, n_episodes=50, n_steps=100,
                        reset_at_terminal=False, plot_only=False, open_plot=True, plot_title=True)
Example #13
def example():
    n_env = 4
    env_distribution = make_env_distribution(env_class='test',
                                             n_env=n_env,
                                             gamma=GAMMA,
                                             w=60,
                                             h=20)
    actions = env_distribution.get_actions()

    m = 1  # Count threshold
    max_mem = None
    p_min = 1. / float(n_env)
    delta = 0.99
    lrmax = LRMax(actions=actions,
                  gamma=GAMMA,
                  count_threshold=m,
                  max_memory_size=max_mem,
                  prior=None,
                  min_sampling_probability=p_min,
                  delta=delta)
    rmax_max_q_init = MaxQInit(actions=actions,
                               gamma=GAMMA,
                               count_threshold=m,
                               min_sampling_probability=p_min,
                               delta=delta)
    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)

    run_agents_lifelong([rmax_max_q_init, lrmax, rmax],
                        env_distribution,
                        samples=10,
                        episodes=10,
                        steps=100,
                        reset_at_terminal=False,
                        open_plot=True,
                        cumulative_plot=False,
                        is_tracked_value_discounted=True,
                        plot_only=False)
Example #14
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    n_states = 20
    env_distribution = make_env_distribution(env_class='corridor',
                                             n_env=n_env,
                                             gamma=gamma,
                                             w=n_states,
                                             h=1)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions,
                gamma=gamma,
                r_max=r_max,
                v_max=v_max,
                deduce_v_max=False,
                n_known=n_known,
                deduce_n_known=False,
                epsilon_q=epsilon_q,
                epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions,
                  gamma=gamma,
                  r_max=r_max,
                  v_max=v_max,
                  deduce_v_max=False,
                  n_known=n_known,
                  deduce_n_known=False,
                  epsilon_q=epsilon_q,
                  epsilon_m=epsilon_m,
                  delta=delta,
                  n_states=n_states,
                  max_memory_size=max_mem,
                  prior=None,
                  estimate_distances_online=True,
                  min_sampling_probability=p_min,
                  name='LRMax')
    lrmaxprior02 = LRMax(actions=actions,
                         gamma=gamma,
                         r_max=r_max,
                         v_max=v_max,
                         deduce_v_max=False,
                         n_known=n_known,
                         deduce_n_known=False,
                         epsilon_q=epsilon_q,
                         epsilon_m=epsilon_m,
                         delta=delta,
                         n_states=n_states,
                         max_memory_size=max_mem,
                         prior=0.2,
                         estimate_distances_online=False,
                         min_sampling_probability=p_min,
                         name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions,
                        gamma=gamma,
                        r_max=r_max,
                        v_max=v_max,
                        deduce_v_max=False,
                        n_known=n_known,
                        deduce_n_known=False,
                        epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m,
                        delta=delta,
                        n_states=n_states,
                        min_sampling_probability=p_min,
                        name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions,
                            gamma=gamma,
                            r_max=r_max,
                            v_max=v_max,
                            deduce_v_max=False,
                            n_known=n_known,
                            deduce_n_known=False,
                            epsilon_q=epsilon_q,
                            epsilon_m=epsilon_m,
                            delta=delta,
                            n_states=n_states,
                            max_memory_size=max_mem,
                            prior=None,
                            estimate_distances_online=True,
                            min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions,
                                   gamma=gamma,
                                   r_max=r_max,
                                   v_max=v_max,
                                   deduce_v_max=False,
                                   n_known=n_known,
                                   deduce_n_known=False,
                                   epsilon_q=epsilon_q,
                                   epsilon_m=epsilon_m,
                                   delta=delta,
                                   n_states=n_states,
                                   max_memory_size=max_mem,
                                   prior=0.2,
                                   estimate_distances_online=True,
                                   min_sampling_probability=p_min,
                                   name='LRMaxQInit(0.2)')
    agents_pool = [
        rmax, lrmax, lrmaxprior02, maxqinit, lrmaxqinit, lrmaxqinitprior02
    ]

    # Run
    run_agents_lifelong(agents_pool,
                        env_distribution,
                        name_identifier=None,
                        n_instances=1,
                        n_tasks=20,
                        n_episodes=20,
                        n_steps=11,
                        reset_at_terminal=False,
                        do_run=False,
                        do_plot=True,
                        open_plot=False,
                        episodes_moving_average=False,
                        episodes_ma_width=10,
                        tasks_moving_average=False,
                        tasks_ma_width=10,
                        latex_rendering=True,
                        plot_title=False)
Example #15
    def __init__(self,
                 actions,
                 gamma=.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 max_memory_size=None,
                 prior=None,
                 estimate_distances_online=True,
                 min_sampling_probability=.1,
                 name="LRMax"):
        """
        :param actions: action space of the environment
        :param gamma: (float) discount factor
        :param r_max: (float) known upper-bound on the reward function
        :param v_max: (float) known upper-bound on the value function
        :param deduce_v_max: (bool) set to True to deduce v_max from r_max
        :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon are not defined)
        :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
        :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
        :param epsilon_m: (float) precision of the learned models in L1 norm
        :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
        :param n_states: (int) number of states

        :param max_memory_size: (int) maximum number of saved models (infinity if None)
        :param prior: (float) prior knowledge of maximum model's distance
        :param estimate_distances_online: (bool) set to True for online estimation of a tighter upper-bound for the
        model pseudo-distances. The estimation is valid with high probability.
        :param min_sampling_probability: (float) minimum sampling probability of an environment
        :param name: (str)
        """
        self.name = name
        RMax.__init__(self,
                      actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=deduce_v_max,
                      n_known=n_known,
                      deduce_n_known=deduce_n_known,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      name=name)

        # Lifelong Learning memories
        self.max_memory_size = max_memory_size
        self.U_memory = []
        self.R_memory = []
        self.T_memory = []
        self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))

        self.U_lip = []
        self.b = self.epsilon_m * (1. + self.gamma * self.v_max)

        # Prior knowledge on maximum model distance
        prior_max = self.r_max + self.gamma * 2. * self.v_max
        self.prior = prior_max if prior is None else min(prior, prior_max)
        self.prior = round(self.prior, 2)

        # Online distances estimation
        self.estimate_distances_online = estimate_distances_online
        self.min_sampling_probability = min_sampling_probability
        self.D = defaultdict(lambda: defaultdict(lambda: prior_max))  # Dictionary of distances (high probability)
        self.n_samples_high_confidence = compute_n_samples_high_confidence(
            min_sampling_probability, delta)

        self.update_upper_bound()
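compute_n_samples_high_confidence plays the same role here as the required-task counter in MaxQInit. A plausible sketch, assuming the same Bernoulli sampling argument (smallest n with (1 - min_sampling_probability)**n <= delta); the repository's actual implementation may differ:

import math

def compute_n_samples_high_confidence(min_sampling_probability, delta):
    # Number of sampled tasks after which the online distance estimates stored
    # in self.D hold with probability at least 1 - delta.
    return int(math.ceil(math.log(delta) / math.log(1.0 - min_sampling_probability)))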
Example #16
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    w, h = 20, 20
    n_states = w * h
    env_distribution = make_env_distribution(
        env_class='grid-world',
        env_name='grid-world-two-goals-large',
        n_env=n_env,
        gamma=gamma,
        w=w,
        h=h)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions,
                gamma=gamma,
                r_max=r_max,
                v_max=v_max,
                deduce_v_max=False,
                n_known=n_known,
                deduce_n_known=False,
                epsilon_q=epsilon_q,
                epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions,
                  gamma=gamma,
                  r_max=r_max,
                  v_max=v_max,
                  deduce_v_max=False,
                  n_known=n_known,
                  deduce_n_known=False,
                  epsilon_q=epsilon_q,
                  epsilon_m=epsilon_m,
                  delta=delta,
                  n_states=n_states,
                  max_memory_size=max_mem,
                  prior=None,
                  estimate_distances_online=True,
                  min_sampling_probability=p_min,
                  name='LRMax')
    lrmaxprior02 = LRMax(actions=actions,
                         gamma=gamma,
                         r_max=r_max,
                         v_max=v_max,
                         deduce_v_max=False,
                         n_known=n_known,
                         deduce_n_known=False,
                         epsilon_q=epsilon_q,
                         epsilon_m=epsilon_m,
                         delta=delta,
                         n_states=n_states,
                         max_memory_size=max_mem,
                         prior=0.2,
                         estimate_distances_online=False,
                         min_sampling_probability=p_min,
                         name='LRMax(Dmax0.2)')
    maxqinit = MaxQInit(actions=actions,
                        gamma=gamma,
                        r_max=r_max,
                        v_max=v_max,
                        deduce_v_max=False,
                        n_known=n_known,
                        deduce_n_known=False,
                        epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m,
                        delta=delta,
                        n_states=n_states,
                        min_sampling_probability=p_min,
                        name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions,
                            gamma=gamma,
                            r_max=r_max,
                            v_max=v_max,
                            deduce_v_max=False,
                            n_known=n_known,
                            deduce_n_known=False,
                            epsilon_q=epsilon_q,
                            epsilon_m=epsilon_m,
                            delta=delta,
                            n_states=n_states,
                            max_memory_size=max_mem,
                            prior=None,
                            estimate_distances_online=True,
                            min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions,
                                   gamma=gamma,
                                   r_max=r_max,
                                   v_max=v_max,
                                   deduce_v_max=False,
                                   n_known=n_known,
                                   deduce_n_known=False,
                                   epsilon_q=epsilon_q,
                                   epsilon_m=epsilon_m,
                                   delta=delta,
                                   n_states=n_states,
                                   max_memory_size=max_mem,
                                   prior=0.2,
                                   estimate_distances_online=True,
                                   min_sampling_probability=p_min,
                                   name='LRMaxQInit(Dmax0.2)')
    agents_pool = [
        rmax, lrmax, lrmaxprior02, maxqinit, lrmaxqinit, lrmaxqinitprior02
    ]

    # Run
    run_agents_lifelong(agents_pool,
                        env_distribution,
                        name_identifier=None,
                        n_instances=1,
                        n_tasks=100,
                        n_episodes=100,
                        n_steps=13,
                        reset_at_terminal=False,
                        open_plot=False,
                        plot_title=True,
                        do_run=True,
                        do_plot=True,
                        parallel_run=True,
                        n_processes=None)
Example #17
def experiment(p, name):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(env_class='tight',
                                             n_env=n_env,
                                             gamma=gamma,
                                             env_name=name,
                                             version=p['version'],
                                             w=size,
                                             h=size,
                                             stochastic=p['stochastic'],
                                             verbose=False)
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / n_env
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 10.
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions,
                gamma=gamma,
                r_max=r_max,
                v_max=v_max,
                deduce_v_max=False,
                n_known=n_known,
                deduce_n_known=False,
                epsilon_q=epsilon_q,
                epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions,
                  gamma=gamma,
                  r_max=r_max,
                  v_max=v_max,
                  deduce_v_max=False,
                  n_known=n_known,
                  deduce_n_known=False,
                  epsilon_q=epsilon_q,
                  epsilon_m=epsilon_m,
                  delta=delta,
                  n_states=n_states,
                  max_memory_size=max_mem,
                  prior=None,
                  estimate_distances_online=True,
                  min_sampling_probability=p_min,
                  name='LRMax')
    lrmax_p01 = LRMax(actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=False,
                      n_known=n_known,
                      deduce_n_known=False,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      max_memory_size=max_mem,
                      prior=0.1,
                      estimate_distances_online=True,
                      min_sampling_probability=p_min,
                      name='LRMax(0.1)')
    lrmax_p015 = LRMax(actions=actions,
                       gamma=gamma,
                       r_max=r_max,
                       v_max=v_max,
                       deduce_v_max=False,
                       n_known=n_known,
                       deduce_n_known=False,
                       epsilon_q=epsilon_q,
                       epsilon_m=epsilon_m,
                       delta=delta,
                       n_states=n_states,
                       max_memory_size=max_mem,
                       prior=0.15,
                       estimate_distances_online=True,
                       min_sampling_probability=p_min,
                       name='LRMax(0.15)')
    lrmax_p02 = LRMax(actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=False,
                      n_known=n_known,
                      deduce_n_known=False,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      max_memory_size=max_mem,
                      prior=0.2,
                      estimate_distances_online=True,
                      min_sampling_probability=p_min,
                      name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions,
                        gamma=gamma,
                        r_max=r_max,
                        v_max=v_max,
                        deduce_v_max=False,
                        n_known=n_known,
                        deduce_n_known=False,
                        epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m,
                        delta=delta,
                        n_states=n_states,
                        min_sampling_probability=p_min,
                        name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions,
                            gamma=gamma,
                            r_max=r_max,
                            v_max=v_max,
                            deduce_v_max=False,
                            n_known=n_known,
                            deduce_n_known=False,
                            epsilon_q=epsilon_q,
                            epsilon_m=epsilon_m,
                            delta=delta,
                            n_states=n_states,
                            max_memory_size=max_mem,
                            prior=None,
                            estimate_distances_online=True,
                            min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions,
                                gamma=gamma,
                                r_max=r_max,
                                v_max=v_max,
                                deduce_v_max=False,
                                n_known=n_known,
                                deduce_n_known=False,
                                epsilon_q=epsilon_q,
                                epsilon_m=epsilon_m,
                                delta=delta,
                                n_states=n_states,
                                max_memory_size=max_mem,
                                prior=0.1,
                                estimate_distances_online=True,
                                min_sampling_probability=p_min,
                                name='LRMaxQInit(0.1)')
    lrmaxqinit_p015 = LRMaxQInit(actions=actions,
                                 gamma=gamma,
                                 r_max=r_max,
                                 v_max=v_max,
                                 deduce_v_max=False,
                                 n_known=n_known,
                                 deduce_n_known=False,
                                 epsilon_q=epsilon_q,
                                 epsilon_m=epsilon_m,
                                 delta=delta,
                                 n_states=n_states,
                                 max_memory_size=max_mem,
                                 prior=0.15,
                                 estimate_distances_online=True,
                                 min_sampling_probability=p_min,
                                 name='LRMaxQInit(0.15)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions,
                                gamma=gamma,
                                r_max=r_max,
                                v_max=v_max,
                                deduce_v_max=False,
                                n_known=n_known,
                                deduce_n_known=False,
                                epsilon_q=epsilon_q,
                                epsilon_m=epsilon_m,
                                delta=delta,
                                n_states=n_states,
                                max_memory_size=max_mem,
                                prior=0.2,
                                estimate_distances_online=True,
                                min_sampling_probability=p_min,
                                name='LRMaxQInit(0.2)')
    # agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p015, lrmax_p02, maxqinit, lrmaxqinit, lrmaxqinit_p01, lrmaxqinit_p015, lrmaxqinit_p02]
    agents_pool = [
        rmax, lrmax, lrmax_p02, lrmax_p01, maxqinit, lrmaxqinit, lrmaxqinit_p01
    ]

    # Run
    run_agents_lifelong(agents_pool,
                        env_distribution,
                        n_instances=2,
                        n_tasks=p['n_tasks'],
                        n_episodes=p['n_episodes'],
                        n_steps=p['n_steps'],
                        reset_at_terminal=False,
                        open_plot=False,
                        plot_title=False,
                        plot_legend=2,
                        do_run=True,
                        do_plot=True,
                        parallel_run=True,
                        n_processes=None,
                        episodes_moving_average=True,
                        episodes_ma_width=100,
                        tasks_moving_average=False,
                        latex_rendering=True)