def generate_vi(mdp, c, lr=0.1):
    """Scatter-plot one value-iteration run started from a random policy.

    A policy is drawn with ``utils.random_policy(mdp.S, mdp.A)``, evaluated
    with ``utils.value_functional`` and used as the starting point for
    ``utils.solve`` driven by ``ss.value_iteration(mdp, lr)``. The iterates
    are stacked and index 0 of the last axis is kept, giving a 2-D array with
    one row per iterate; the first two entries of each row are used as the
    x and y plot coordinates.

    Three layers are drawn with ``plt.scatter``:
      * the first iterate, in colour ``c`` with size 30, labelled with the
        total number of iterates;
      * the intermediate iterates, size 10, coloured by their position in
        the run using the ``viridis`` colormap;
      * the last iterate, as a magenta ``x`` marker.

    Args:
        mdp: Object exposing ``S``, ``A``, ``P``, ``r`` and ``discount``.
        c: Colour passed to ``plt.scatter`` for the starting point.
        lr: Value forwarded to ``ss.value_iteration`` (presumably the step
            size -- confirm against ``ss``).

    Returns:
        None. The function only draws.
    """
    start_policy = utils.random_policy(mdp.S, mdp.A)
    start_value = utils.value_functional(
        mdp.P, mdp.r, start_policy, mdp.discount)

    iterates = utils.solve(ss.value_iteration(mdp, lr), start_value)
    # Stack into one array and drop the trailing axis by taking its index 0.
    path = np.stack(iterates)[:, :, 0]
    n_iterates = path.shape[0]

    start, middle, end = path[0], path[1:-1], path[-1]

    plt.scatter(start[0], start[1], c=c, s=30, label='{}'.format(n_iterates))
    # One colour value per intermediate point: 0 .. n_iterates - 3.
    plt.scatter(middle[:, 0], middle[:, 1],
                c=range(n_iterates - 2), cmap='viridis', s=10)
    plt.scatter(end[0], end[1], c='m', marker='x')
def VI(init, M, f):
    """Run value iteration from ``init`` and project the result through ``f``.

    Args:
        init: Policy handed to ``utils.value_functional`` to build the
            starting point.
        M: Object exposing ``P``, ``r`` and ``discount``.
        f: Array-like with a ``.T`` attribute; its transpose is multiplied
            with the final iterate (the original comment calls this step
            "lift" -- presumably a map between two problem representations;
            confirm with the caller).

    Returns:
        ``np.dot(f.T, V)`` where ``V`` is the last element returned by
        ``utils.solve``.
    """
    # Solve: start from the value of ``init`` and iterate with a fixed
    # 0.01 passed to ``ss.value_iteration``.
    start_value = utils.value_functional(M.P, M.r, init, M.discount)
    update = ss.value_iteration(M, 0.01)
    trajectory = utils.solve(update, start_value)
    final_value = trajectory[-1]

    # Lift: apply the transpose of ``f`` to the final iterate.
    lifted = np.dot(f.T, final_value)
    return lifted
def value_iteration(mdp, pis, lr):
    """Run value iteration once per starting policy and keep every trajectory.

    For each policy in ``pis`` the starting point is
    ``utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)``; it is passed
    to ``utils.solve`` together with the update built by
    ``ss.value_iteration(mdp, lr)``.

    Args:
        mdp: Object exposing ``P``, ``r`` and ``discount``.
        pis: Iterable of policies accepted by ``utils.value_functional``.
        lr: Value forwarded to ``ss.value_iteration`` (presumably the step
            size -- confirm against ``ss``).

    Returns:
        A list with one entry per policy, in input order. Each entry is
        exactly what ``utils.solve`` returned for that policy (presumably
        the sequence of iterates -- confirm against ``utils.solve``). An
        empty ``pis`` yields an empty list.
    """
    trajs = []
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        # The update is rebuilt for every policy: whether the object returned
        # by ``ss.value_iteration`` can be reused is not visible from here.
        trajs.append(utils.solve(ss.value_iteration(mdp, lr), init_V))
    return trajs
def value_iteration(mdp, pis, lr=0.01):
    """Run value iteration per starting policy; report length and end point.

    For each policy in ``pis`` the starting point is
    ``utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)``; it is passed
    to ``utils.solve`` together with the update built by
    ``ss.value_iteration(mdp, lr)``.

    Args:
        mdp: Object exposing ``P``, ``r`` and ``discount``.
        pis: Iterable of policies accepted by ``utils.value_functional``.
        lr: Value forwarded to ``ss.value_iteration`` (presumably the step
            size -- confirm against ``ss``). Defaults to 0.01, the value
            that was previously hard-coded.

    Returns:
        A ``(lens, finals)`` tuple of two lists aligned with ``pis``:
        ``lens[i]`` is ``len()`` of what ``utils.solve`` returned for the
        i-th policy and ``finals[i]`` is its last element. Both lists are
        empty when ``pis`` is empty.
    """
    lens, finals = [], []
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        traj = utils.solve(ss.value_iteration(mdp, lr), init_V)
        finals.append(traj[-1])
        lens.append(len(traj))
    return lens, finals
def mom_value_iteration(mdp, pis, lr=0.01, momentum=0.9):
    """Momentum variant: run per starting policy; report length and end point.

    For each policy in ``pis`` the starting point is
    ``utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)``. The update
    from ``ss.value_iteration(mdp, lr)`` is wrapped with
    ``ss.momentum_bundler(..., momentum)`` and ``utils.solve`` is started
    from the pair ``(init_V, zeros shaped like init_V)``.

    Args:
        mdp: Object exposing ``P``, ``r`` and ``discount``.
        pis: Iterable of policies accepted by ``utils.value_functional``.
        lr: Value forwarded to ``ss.value_iteration`` (presumably the step
            size -- confirm against ``ss``). Defaults to 0.01, the value
            that was previously hard-coded.
        momentum: Second argument of ``ss.momentum_bundler`` (presumably the
            momentum coefficient -- confirm against ``ss``). Defaults to
            0.9, the value that was previously hard-coded.

    Returns:
        A ``(lens, finals)`` tuple of two lists aligned with ``pis``:
        ``lens[i]`` is ``len()`` of what ``utils.solve`` returned for the
        i-th policy and ``finals[i]`` is the first component of its last
        element (the second component of the pair is discarded). Both lists
        are empty when ``pis`` is empty.
    """
    lens, finals = [], []
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        traj = utils.solve(
            ss.momentum_bundler(ss.value_iteration(mdp, lr), momentum),
            # The solver state is a pair; the second slot starts at zero.
            (init_V, np.zeros_like(init_V)))
        # Keep only the first component of the final state pair.
        final_V, _ = traj[-1]
        finals.append(final_V)
        lens.append(len(traj))
    return lens, finals