Example No. 1
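            # presumably the reward-matrix accessor (R) of OneStateMDPWithModel,
            # used by the DP routines below; the enclosing class is not shown here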
            return self.r_mat

    env = OneStateMDP()
    env_with_model = OneStateMDPWithModel()

    # Test Value Iteration
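    # value_iteration is expected to return the optimal state values V* and a greedy
    # optimal policy pi*, starting from an all-zero value estimate with tolerance 1e-4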
    V_star, pi_star = value_iteration(env_with_model,np.zeros(env_with_model.spec.nS),1e-4)

    assert np.allclose(V_star,np.array([1.,0.]),1e-5,1e-2), V_star
    assert pi_star.action(0) == 0

    eval_policy = pi_star
    behavior_policy = RandomPolicy(env.spec.nA)

    # Test Value Prediction
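    # value_prediction evaluates a fixed policy against the environment model,
    # returning both its state values V and state-action values Q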
    V, Q = value_prediction(env_with_model,eval_policy,np.zeros(env.spec.nS),1e-4)

    assert np.allclose(V,np.array([1.,0.]),1e-5,1e-2), V
    assert np.allclose(Q,np.array([[1.,0.],[0.,0.]]),1e-5,1e-2), Q

    V, Q = value_prediction(env_with_model,behavior_policy,np.zeros(env.spec.nS),1e-4)
    assert np.allclose(V,np.array([0.1,0.]),1e-5,1e-2), V
    assert np.allclose(Q,np.array([[0.19,0.],[0.,0.]]),1e-5,1e-2), Q
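    # Sanity check: the start state is worth 1.0 under the optimal policy but only
    # about 0.1 under the uniform random policy, as the asserts above verify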

    # Gather experience using behavior policy
    N_EPISODES = 100000
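    # Roll out 100,000 episodes with the random behavior policy; presumably these
    # trajectories feed the Monte Carlo importance-sampling estimators, as in the
    # GridWorld example below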

    trajs = []
    for _ in tqdm(range(N_EPISODES)):
        states, actions, rewards, done =\
            [env.reset()], [], [], []
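        # The snippet is cut off here; a minimal sketch of how the rollout loop might
        # continue, assuming a Gym-style env.step(a) that returns (next_state, reward, done):
        done_flag = False
        while not done_flag:
            a = behavior_policy.action(states[-1])  # sample an action from the random behavior policy
            next_s, r, done_flag = env.step(a)      # assumed 3-tuple step interface
            states.append(next_s)
            actions.append(a)
            rewards.append(r)
            done.append(done_flag)
        trajs.append((states, actions, rewards, done))  # store the completed episode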
Example No. 2
    print("Generating episodes based on random policy")
    trajs = []
    for _ in tqdm(range(N_EPISODES)):
        s = grid_world.reset()
        traj = []
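        # follow the behavior policy until a terminal state is reached (states 0 and
        # 15, presumably the two terminal corner cells of the classic 4x4 GridWorld)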

        while s != 0 and s != 15:
            a = behavior_policy.action(s)
            next_s, r, _ = grid_world.step(a)
            traj.append((s, a, r, next_s))
            s = next_s
        trajs.append(traj)

    print("DP value prediction under random policy")
    V, Q = value_prediction(grid_world, behavior_policy, initV, 1e-12)
    print(V.reshape((4, 4)))

    print("DP value iteration optimal value and policy")
    V, pi = value_iteration(grid_world, initV, 1e-12)
    print(V.reshape((4, 4)))
    print(visualize(pi).reshape((4, 4)))
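    # visualize(pi) presumably renders each state's greedy action as a printable
    # symbol (e.g. an arrow) so the policy can be inspected on the 4x4 grid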

    # On-policy evaluation tests for random policy
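    # The target policy passed to both estimators below is the behavior policy itself,
    # so every importance-sampling ratio equals 1 and OIS/WIS reduce to ordinary
    # on-policy Monte Carlo prediction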
    # OIS (ordinary importance sampling)
    Q_est_ois = mc_ois(grid_world.spec, trajs, behavior_policy,
                       behavior_policy,
                       np.zeros((grid_world.spec.nS, grid_world.spec.nA)))
    # WIS (weighted importance sampling)
    Q_est_wis = mc_wis(grid_world.spec, trajs, behavior_policy,
                       behavior_policy,