def value_iteration_optimality(mdp, gamma):
    """Run value iteration to convergence and return the optimal value function and greedy policy."""
    it = value_iteration(mdp, gamma)
    vf1 = next(it)
    vf2 = next(it)
    # Keep iterating until two successive value functions are identical (exact convergence).
    while max(abs(np.array(list(vf2.values())) - np.array(list(vf1.values())))) != 0.0:
        vf3 = next(it)
        vf1 = vf2
        vf2 = vf3
    opt_vf = vf2
    opt_policy = greedy_policy_from_vf(mdp, opt_vf, gamma)
    return opt_vf, opt_policy
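# A minimal, self-contained sketch (toy numbers, not the MDP above) of the
# convergence pattern used in value_iteration_optimality: keep pulling
# successive value-function dicts from an iterator until two consecutive
# iterates are identical, then treat the last one as the fixed point.
import numpy as np


def _toy_vf_iterator():
    # Hypothetical sequence of value functions that eventually becomes constant.
    yield from [{'a': 0.0, 'b': 0.0}, {'a': 0.5, 'b': 0.9}, {'a': 0.75, 'b': 0.99},
                {'a': 0.8, 'b': 1.0}, {'a': 0.8, 'b': 1.0}, {'a': 0.8, 'b': 1.0}]


_it = _toy_vf_iterator()
_vf1, _vf2 = next(_it), next(_it)
while max(abs(np.array(list(_vf2.values())) - np.array(list(_vf1.values())))) != 0.0:
    _vf1, _vf2 = _vf2, next(_it)
print(_vf2)  # {'a': 0.8, 'b': 1.0} -- the fixed point of the toy sequence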
def policy_iteration(
        mdp: FiniteMarkovDecisionProcess,
        gamma: float,
        tolerance: float,
        max_iters: int
) -> Tuple[V[S], FinitePolicy]:
    """Implement policy iteration on a finite MDP.

    :param mdp: Object representation of a finite Markov decision process
    :param gamma: Discount factor
    :param tolerance: Maximum absolute change in the value function between
        successive iterations below which the algorithm is deemed converged
    :param max_iters: Maximum number of iterations to allow
    :returns: Tuple of the optimal value function and the optimal policy
    """
    vf, pi = initialize(mdp)
    n_iter = 0
    while True:
        n_iter += 1
        old_vf = vf.copy()
        # Policy evaluation: solve for the value function of the MRP induced by pi
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        vf: V[S] = {
            mrp.non_terminal_states[i]: val
            for i, val in enumerate(mrp.get_value_function_vec(gamma))
        }
        diffs = np.absolute(np.subtract(list(vf.values()), list(old_vf.values())))
        delta = np.max(diffs)
        # Policy improvement: act greedily with respect to the updated value function
        pi: FinitePolicy[S, A] = dp.greedy_policy_from_vf(mdp, vf, gamma)
        if n_iter == max_iters:
            print("Maximum iterations reached.")
            return vf, pi
        if delta < tolerance:
            return vf, pi
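# A minimal numeric sketch (toy numbers, not part of the function above) of the
# linear-algebra step behind the policy-evaluation call
# mrp.get_value_function_vec(gamma): for the MRP induced by a fixed policy,
# exact policy evaluation solves v = R + gamma * P v, i.e. v = (I - gamma * P)^{-1} R.
import numpy as np

P_toy = np.array([[0.7, 0.3],
                  [0.4, 0.6]])   # transition matrix over non-terminal states (toy)
R_toy = np.array([1.0, 2.0])     # expected reward per state (toy)
gamma_toy = 0.9
v_toy = np.linalg.solve(np.eye(2) - gamma_toy * P_toy, R_toy)
print(v_toy)                     # exact value function of the induced MRP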
def frog_escape_value_iteration(
        frog_MDP: fe.FrogProblemMDP,
        gamma: float = 1,
        max_iters: int = 1000
) -> Tuple[Tuple[dp.V[dp.S], mdp.FinitePolicy[dp.S, dp.A]], np.ndarray]:
    """Get the optimal policy for the 'Frog Escape' problem by value iteration.

    :param frog_MDP: MDP representation of the 'Frog Escape' problem
    :param gamma: Discount factor (default 1)
    :param max_iters: Maximum number of value iterations to execute (default 1000)
    :returns: Tuple of (optimal value function, optimal policy) and the array of
        per-iteration convergence errors
    """
    value_iterator: Iterator[dp.V[dp.S]] = dp.value_iteration(mdp=frog_MDP, gamma=gamma)
    vf, errors = iterate(iterator=value_iterator, max_iters=max_iters)
    pi: mdp.FinitePolicy[dp.S, dp.A] = dp.greedy_policy_from_vf(mdp=frog_MDP, vf=vf, gamma=gamma)
    return (vf, pi), errors
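# The `iterate` helper called above is defined elsewhere and not shown in this
# section. The sketch below (under a hypothetical name, _iterate_sketch)
# illustrates the behavior assumed at the call site: consume the value-iteration
# iterator for at most max_iters steps, record the maximum absolute change
# between successive value functions, and return the final value function
# together with the array of per-iteration errors. A real implementation would
# likely also stop early once the error falls below a tolerance.
import itertools
from typing import Any, Dict, Iterator, Tuple

import numpy as np


def _iterate_sketch(iterator: Iterator[Dict[Any, float]],
                    max_iters: int) -> Tuple[Dict[Any, float], np.ndarray]:
    vf = next(iterator)
    errors = []
    for new_vf in itertools.islice(iterator, max_iters - 1):
        # Track the largest per-state change at each sweep.
        errors.append(max(abs(new_vf[s] - vf[s]) for s in new_vf))
        vf = new_vf
    return vf, np.array(errors)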
def solution(
        model: FiniteMarkovDecisionProcess[Coordinate, Action],
        gamma: float
) -> Tuple[int, Mapping[Coordinate, float], FinitePolicy[Coordinate, Action]]:
    """Run value iteration on the given finite MDP and return the iteration count,
    the converged value function, and the greedy optimal policy."""
    count = 0
    v = value_iteration(model, gamma)
    a = next(v, None)
    while a is not None:
        b = next(v, None)
        # Stop once the largest change between successive value functions
        # falls below TOLERANCE.
        if max(abs(a[s] - b[s]) for s in a) < TOLERANCE:
            break
        a = b
        count += 1
    opt_policy: FinitePolicy[Coordinate, Action] = greedy_policy_from_vf(
        model, b, gamma
    )
    return count, b, opt_policy