def get_value_func_dict(self, pol: Policy) -> VFDictType:
    """Evaluate policy ``pol`` by repeated sweeps over all states.

    Every state starts at value 0.  Each sweep recomputes, for every state
    ``s``, ``rew[s] + gamma * sum(p * v[s1])`` over the successor entries
    ``(s1, p)`` of ``s``, always reading the values of the previous sweep.
    Sweeping stops once the largest absolute per-state change is below
    ``self.tol``.

    Args:
        pol: Policy whose ``policy_data`` is handed to the
            ``mdp_rep_to_mrp_rep*`` helpers together with the MDP's
            rewards and transitions.

    Returns:
        Dict mapping each state in ``self.mdp_obj.all_states`` to its
        value after the final sweep.
    """
    mdp = self.mdp_obj
    values = {state: 0. for state in mdp.all_states}
    # Seed the change measure above the tolerance so that, for a
    # non-negative tolerance, at least one sweep is performed.
    delta = self.tol * 1e4
    policy_data = pol.policy_data
    # NOTE(review): presumably these helpers collapse the per-action MDP
    # rewards/transitions into per-state ones under the policy -- confirm
    # against their definitions.
    rewards = mdp_rep_to_mrp_rep2(mdp.rewards, policy_data)
    transitions = mdp_rep_to_mrp_rep1(mdp.transitions, policy_data)

    while delta >= self.tol:
        updated = {}
        for state in mdp.all_states:
            expected_next = sum(
                prob * values[nxt]
                for nxt, prob in transitions[state].items()
            )
            updated[state] = rewards[state] + mdp.gamma * expected_next
        # Largest absolute change of any state's value during this sweep.
        delta = max(
            abs(updated[state] - previous)
            for state, previous in values.items()
        )
        values = updated
    return values
def get_mrp(self, pol: Policy) -> MRP:
    """Construct an ``MRP`` from this object's data and policy ``pol``.

    Args:
        pol: Policy whose ``policy_data`` is passed to the
            ``mdp_rep_to_mrp_rep*`` helpers along with ``self.transitions``
            and ``self.rewards``.

    Returns:
        An ``MRP`` built from a dict that maps each state to the pair
        ``(transition entry, reward entry)`` for that state, together with
        ``self.gamma``.
    """
    policy_data = pol.policy_data
    transitions = mdp_rep_to_mrp_rep1(self.transitions, policy_data)
    rewards = mdp_rep_to_mrp_rep2(self.rewards, policy_data)

    # Pair every state's transition entry with the reward for that state.
    mrp_data = {}
    for state, successors in transitions.items():
        mrp_data[state] = (successors, rewards[state])
    return MRP(mrp_data, self.gamma)