Example no. 1
import numpy as np

from rlplan.agents.planning import DynProgAgent
from rlplan.envs.toy import ToyEnv1
from rlplan.policy import FinitePolicy  # import path assumed for FinitePolicy


def test_bellman_operator_monotonicity_and_contraction(gamma, seed):
    env = ToyEnv1(seed)
    V0 = np.array([1.0, 100.0, 1000.0])
    V1 = np.array([2.0, 120.0, 1200.0])

    # Each row is a probability distribution over the two actions (rows sum to 1)
    policy_array = np.array([[0.2, 0.8], [0.5, 0.5], [0.9, 0.1]])
    policy = FinitePolicy(policy_array, seed)

    dp_agent = DynProgAgent(env, gamma=gamma)

    TV0, _ = dp_agent.bellman_opt_operator(V0)
    TV1, _ = dp_agent.bellman_opt_operator(V1)

    TpiV0 = dp_agent.bellman_operator(V0, policy)
    TpiV1 = dp_agent.bellman_operator(V1, policy)

    # Test monotonicity
    assert np.greater(TV0, TV1).sum() == 0
    assert np.greater(TpiV0, TpiV1).sum() == 0

    # Test contraction
    norm_tv = np.abs(TV1 - TV0).max()
    norm_v = np.abs(V1 - V0).max()
    assert norm_tv <= gamma * norm_v
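These assertions check the two standard properties of Bellman operators that dynamic programming relies on, stated here for reference: monotonicity and gamma-contraction in the sup norm,

    V \le V' \;\Rightarrow\; TV \le TV', \qquad \|TV' - TV\|_\infty \le \gamma \, \|V' - V\|_\infty,

together with the analogous inequalities for the policy operator T^\pi computed by bellman_operator.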
Example no. 2
import numpy as np

from rlplan.agents.planning import DynProgAgent
from rlplan.envs.gridworld import GridWorld


def test_value_and_policy_iteration_gridworld(sx, sy, gamma):
    # Tolerance
    tol = 1e-8

    # Environment
    env = GridWorld(nrows=sx, ncols=sy)

    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')
    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()

    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e2 * tol)
Example no. 3
import numpy as np

from rlplan.agents.planning import DynProgAgent
from rlplan.envs.toy import ToyEnv2  # import path assumed, by analogy with ToyEnv1


def test_value_and_policy_iteration(gamma, seed, Ns, Na):
    # Tolerance
    tol = 1e-8

    # Environment
    env = ToyEnv2(Ns, Na, seed)

    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')
    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()

    assert dp_agent_val.policy == dp_agent_pol.policy
    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e1 * tol)
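For reference, here is a minimal NumPy sketch of the two algorithms these tests compare; it is independent of rlplan, and the 2-state, 2-action MDP below is made up for illustration. It shows value iteration and policy iteration converging to the same optimal value function.

import numpy as np

# Hypothetical MDP: P[a, s, s'] is a transition kernel, R[s, a] a reward table
P = np.array([[[0.9, 0.1], [0.2, 0.8]],
              [[0.5, 0.5], [0.0, 1.0]]])
R = np.array([[0.0, 1.0], [0.5, 0.0]])
gamma, tol, Ns = 0.9, 1e-8, 2

# Value iteration: iterate the optimal Bellman operator to its fixed point
V = np.zeros(Ns)
while True:
    V_new = (R + gamma * np.einsum('aij,j->ia', P, V)).max(axis=1)
    if np.abs(V_new - V).max() < tol:
        break
    V = V_new

# Policy iteration: exact policy evaluation (linear solve) + greedy improvement
pi = np.zeros(Ns, dtype=int)
while True:
    P_pi = P[pi, np.arange(Ns), :]          # transition matrix under pi
    R_pi = R[np.arange(Ns), pi]             # reward vector under pi
    V_pi = np.linalg.solve(np.eye(Ns) - gamma * P_pi, R_pi)
    pi_new = (R + gamma * np.einsum('aij,j->ia', P, V_pi)).argmax(axis=1)
    if np.array_equal(pi_new, pi):
        break
    pi = pi_new

assert np.allclose(V_new, V_pi, atol=1e-6)  # both methods agree on V*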
Example no. 4
        assert nrows >= 3
        assert ncols >= 3

        # Build a wall along the middle column, leaving a single opening at the middle row
        middle_col = ncols // 2
        middle_row = nrows // 2
        walls = ()
        for row in range(nrows):
            if row != middle_row:
                walls += ((row, middle_col), )
        #

        super().__init__(seed_val, nrows, ncols, start_coord, terminal_states,
                         success_probability, reward_at, walls, default_reward,
                         enable_render)


if __name__ == '__main__':
    gw = TwoRoomDense(9, 9, success_probability=1.0)

    from rlplan.agents.planning import DynProgAgent
    dynprog = DynProgAgent(gw, method='policy-iteration', gamma=0.9)
    V, _ = dynprog.train()
    gw.display_values(V)

    # run
    gw.render(mode='auto', policy=dynprog.policy)

    # reset
    gw.reset()
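As a quick illustration of the wall layout built above, here is a small standalone sketch (independent of rlplan; the grid size is chosen arbitrarily) that reproduces the same walls tuple and prints the two rooms with their single opening:

nrows, ncols = 5, 5
middle_col, middle_row = ncols // 2, nrows // 2
walls = tuple((row, middle_col) for row in range(nrows) if row != middle_row)

for row in range(nrows):
    print(''.join('#' if (row, col) in walls else '.' for col in range(ncols)))
# Prints:
# ..#..
# ..#..
# .....
# ..#..
# ..#..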
Example no. 5
        return self.gamma*mu + self.r/len(self.sampled_nodes)


if __name__ == '__main__':
    from rlplan.agents.planning import DynProgAgent
    from rlplan.envs.toy import ToyEnv1
    import numpy as np

    # Define parameters
    gamma = 0.1  # discount factor
    seed = 55  # random seed

    # Initialize environment
    env = ToyEnv1(seed_val=seed)

    # ----------------------------------------------------------
    # Finding the exact value function
    # ----------------------------------------------------------
    dynprog = DynProgAgent(env, method='policy-iteration', gamma=gamma)
    V, _ = dynprog.train()

    # ----------------------------------------------------------
    # TrailBlazer
    # ----------------------------------------------------------
    state = env.reset()
    tb = TrailBlazer(state, env, gamma=gamma, delta=0.1, epsilon=1.0)
    val = tb.run()

    print("Value function = ", V)
    print("TrailBlazer estimate of V[%d] = %f" % (state, val))
Example no. 6
from rlplan.agents.planning import DynProgAgent
from rlplan.envs.toy import ToyEnv1
from rlplan.envs.gridworld import GridWorld
from rlplan.envs import Chain
from rlplan.prediction import TabularTD
from rlplan.agents import QLearningAgent  # import path assumed; QLearningAgent is used below

# Discount factor
gamma = 0.9

# Create environment
# env = Chain(10)
# env = ToyEnv1()
env = GridWorld(success_probability=0.9, nrows=4, ncols=4, walls=((1, 1), ))

# Initialize and train dynamic programming agent
dp_agent = DynProgAgent(env, method='policy-iteration', gamma=gamma)
V_dp, _ = dp_agent.train()

# Initialize and train q-learning agent
ql_agent = QLearningAgent(env,
                          gamma=gamma,
                          learning_rate=None,
                          min_learning_rate=0.1,
                          epsilon=0.2)
training_info = ql_agent.train(n_steps=1e5)
V_ql = training_info['V']

# Use tabular TD
tab_td = TabularTD(env,
                   dp_agent.policy,
                   gamma,
Example no. 7
ql_agent = QLearningUcbAgent(env,
                             gamma=gamma,
                             learning_rate=None,
                             min_learning_rate=0.1,
                             c_expl=4.0)

training_info = ql_agent.train(n_steps=1000, eval_params={'n_sim': 10})

# # Visualize policy
# env.reset()
# env.render(mode='auto', policy=ql_agent.policy)
#
# # Visualize training curve
# ql_agent.plot_rewards(training_info['rewards_list'], training_info['x_data'], show=True)

dp_agent = DynProgAgent(env, gamma=gamma, method='policy-iteration')

# Draw history
# draw_grid_world_state_distribution(ql_agent.env)

action_freq = get_action_frequency(ql_agent.env)
policy_freq = FinitePolicy(action_freq)

# visualize_exploration(ql_agent.env, show=False)
# env.render('manual')
# plt.show()
#
env_eval.reset()
env_eval.render(policy=ql_agent.policy)
env_eval.reset()
env_eval.render(policy=policy_freq)
Example no. 8
    """

    action_freq = np.zeros((env.Ns, env.Na))
    H = len(env.history)

    for ii in range(H):
        state, action, reward, next_state, done = env.history[ii]
        action_freq[state, action] += 1.0

    # Normalize counts so each visited state's row is a distribution over actions
    for state in range(env.Ns):
        action_freq[state, :] /= action_freq[state, :].sum()

    return action_freq


if __name__ == '__main__':
    from rlplan.envs import GridWorld
    from rlplan.agents.planning import DynProgAgent

    env = GridWorld()
    dp_agent = DynProgAgent(env, method='policy-iteration', gamma=0.9)
    dp_agent.train()

    env.track = True
    for step in range(15):
        env.step(dp_agent.policy.sample(env.state))
    draw_gridworld_history(env)

    env.render('manual')
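As a quick sanity check of the normalization in get_action_frequency, the same computation on a tiny hand-made history; the transitions below are invented for illustration and assume the (state, action, reward, next_state, done) layout of env.history used above.

import numpy as np

history = [(0, 1, 0.0, 1, False), (0, 1, 0.0, 1, False),
           (0, 0, 1.0, 0, True), (1, 0, 0.0, 0, False)]
Ns, Na = 2, 2

freq = np.zeros((Ns, Na))
for state, action, _, _, _ in history:
    freq[state, action] += 1.0
freq /= freq.sum(axis=1, keepdims=True)   # each visited state's row sums to 1

print(freq)   # [[0.3333..., 0.6666...], [1.0, 0.0]]
# A state that never appears in the history would produce a 0/0 row here,
# exactly as in get_action_frequency above.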