        return self.r_mat


env = OneStateMDP()
env_with_model = OneStateMDPWithModel()

# Test Value Iteration
V_star, pi_star = value_iteration(env_with_model, np.zeros(env_with_model.spec.nS), 1e-4)
assert np.allclose(V_star, np.array([1., 0.]), 1e-5, 1e-2), V_star
assert pi_star.action(0) == 0

eval_policy = pi_star
behavior_policy = RandomPolicy(env.spec.nA)

# Test Value Prediction
V, Q = value_prediction(env_with_model, eval_policy, np.zeros(env.spec.nS), 1e-4)
assert np.allclose(V, np.array([1., 0.]), 1e-5, 1e-2), V
assert np.allclose(Q, np.array([[1., 0.], [0., 0.]]), 1e-5, 1e-2), Q

V, Q = value_prediction(env_with_model, behavior_policy, np.zeros(env.spec.nS), 1e-4)
assert np.allclose(V, np.array([0.1, 0.]), 1e-5, 1e-2), V
assert np.allclose(Q, np.array([[0.19, 0.], [0., 0.]]), 1e-5, 1e-2), Q

# Gather experience using behavior policy
N_EPISODES = 100000

trajs = []
for _ in tqdm(range(N_EPISODES)):
    states, actions, rewards, done = [env.reset()], [], [], False
    # Roll out one episode under the behavior policy, then store it as
    # (s, a, r, s') tuples to match the trajectory format used below.
    while not done:
        a = behavior_policy.action(states[-1])
        s, r, done = env.step(a)
        states.append(s)
        actions.append(a)
        rewards.append(r)
    trajs.append(list(zip(states[:-1], actions, rewards, states[1:])))
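
# For reference, a minimal sketch of the Bellman-optimality backup that the
# value_iteration routine exercised above is expected to perform. This is an
# illustrative assumption, not the repo's graded implementation: it presumes
# the model env exposes spec.gamma, a transition tensor env.TD of shape
# (nS, nA, nS), and a reward tensor env.R of the same shape, and it returns
# greedy actions as a plain array rather than a Policy object.
def value_iteration_sketch(env, initV, theta):
    V = initV.copy()
    while True:
        # Q[s, a] = sum_s' P(s'|s,a) * (R(s, a, s') + gamma * V[s'])
        Q = np.einsum('san,san->sa', env.TD, env.R + env.spec.gamma * V)
        new_V = Q.max(axis=1)
        if np.abs(new_V - V).max() < theta:
            return new_V, Q.argmax(axis=1)
        V = new_V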
print("Generating episodes based on random policy") trajs = [] for _ in tqdm(range(N_EPISODES)): s = grid_world.reset() traj = [] while s != 0 and s != 15: a = behavior_policy.action(s) next_s, r, _ = grid_world.step(a) traj.append((s, a, r, next_s)) s = next_s trajs.append(traj) print("DP value prediction under random policy") V, Q = value_prediction(grid_world, behavior_policy, initV, 1e-12) print(V.reshape((4, 4))) print("DP value iteration optimal value and policy") V, pi = value_iteration(grid_world, initV, 1e-12) print(V.reshape((4, 4))) print(visualize(pi).reshape((4, 4))) # On-policy evaluation tests for random policy # OIS Q_est_ois = mc_ois(grid_world.spec, trajs, behavior_policy, behavior_policy, np.zeros((grid_world.spec.nS, grid_world.spec.nA))) # WIS Q_est_wis = mc_wis(grid_world.spec, trajs, behavior_policy, behavior_policy,