Example #1
    def get_mrp_refined(self, pol: Policy) -> MRPRefined:
        # Collapse the MDP transition and reward maps under the policy,
        # then pair each successor state's probability with its reward.
        tr = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data)
        rew_ref = mdp_rep_to_mrp_rep1(self.rewards_refined, pol.policy_data)
        return MRPRefined(
            {s: {s1: (v1, rew_ref[s][s1]) for s1, v1 in v.items()}
             for s, v in tr.items()},
            self.gamma
        )
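The helper mdp_rep_to_mrp_rep1 is not shown on this page. From its use above it appears to take an MDP map of the form {state: {action: {next_state: value}}} together with the policy map {state: {action: prob}} and collapse the action dimension by policy-weighting. A minimal sketch under that assumption (the _sketch name and dict-based signature are illustrative, not the library's actual code):

from typing import Dict, Mapping, TypeVar

S = TypeVar('S')  # state type
A = TypeVar('A')  # action type

def mdp_rep_to_mrp_rep1_sketch(
    mdp_rep: Mapping[S, Mapping[A, Mapping[S, float]]],
    policy_rep: Mapping[S, Mapping[A, float]]
) -> Dict[S, Dict[S, float]]:
    # For each state s, weight each action's successor-state values by
    # the policy probability pi(a|s) and sum over actions.
    result: Dict[S, Dict[S, float]] = {}
    for s, act_map in mdp_rep.items():
        out: Dict[S, float] = {}
        for a, succ_map in act_map.items():
            pi = policy_rep[s].get(a, 0.)
            for s1, val in succ_map.items():
                out[s1] = out.get(s1, 0.) + pi * val
        result[s] = out
    return result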
Example #2
    def get_mrp_refined(self, pol: Policy) -> MRPRefined:
        # Flatten {s: {a: {s1: x}}} maps to {(s, a, s1): x} so rewards
        # and transition probabilities can be combined key-wise.
        flat_transitions = flatten_sasf_dict(self.transitions)
        flat_rewards_refined = flatten_sasf_dict(self.rewards_refined)

        # Probability-weighted rewards: r(s, a, s1) * p(s1 | s, a).
        flat_exp_rewards = merge_dicts(flat_rewards_refined, flat_transitions, lambda x, y: x * y)
        exp_rewards = unflatten_sasf_dict(flat_exp_rewards)

        # Collapse the action dimension under the policy.
        tr = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data)
        rew_ref = mdp_rep_to_mrp_rep1(
            exp_rewards,
            pol.policy_data
        )
        # Divide by the collapsed transition probabilities to get the
        # expected reward conditional on the s -> s1 transition.
        flat_tr = flatten_ssf_dict(tr)
        flat_rew_ref = flatten_ssf_dict(rew_ref)
        flat_norm_rewards = merge_dicts(flat_rew_ref, flat_tr, lambda x, y: x / y)
        norm_rewards = unflatten_ssf_dict(flat_norm_rewards)

        return MRPRefined(
            {s: {s1: (v1, norm_rewards[s][s1]) for s1, v1 in v.items()}
             for s, v in tr.items()},
            self.gamma
        )
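This second version differs from Example #1 in that it weights each reward by its transition probability before collapsing under the policy, then normalizes by the collapsed transition probability. It relies on a few dictionary helpers that are not shown: from their use, flatten_sasf_dict turns {s: {a: {s1: x}}} into {(s, a, s1): x}, flatten_ssf_dict does the same for two-level dicts, the unflatten_* functions invert them, and merge_dicts combines two dicts key-by-key with a binary function. A minimal sketch under those assumptions (the _sketch names are illustrative, not the library's code):

def flatten_sasf_dict_sketch(d):
    # {s: {a: {s1: x}}}  ->  {(s, a, s1): x}
    return {(s, a, s1): x
            for s, av in d.items()
            for a, sv in av.items()
            for s1, x in sv.items()}

def unflatten_sasf_dict_sketch(d):
    # {(s, a, s1): x}  ->  {s: {a: {s1: x}}}
    out = {}
    for (s, a, s1), x in d.items():
        out.setdefault(s, {}).setdefault(a, {})[s1] = x
    return out

def flatten_ssf_dict_sketch(d):
    # {s: {s1: x}}  ->  {(s, s1): x}
    return {(s, s1): x for s, sv in d.items() for s1, x in sv.items()}

def unflatten_ssf_dict_sketch(d):
    # {(s, s1): x}  ->  {s: {s1: x}}
    out = {}
    for (s, s1), x in d.items():
        out.setdefault(s, {})[s1] = x
    return out

def merge_dicts_sketch(d1, d2, op):
    # Combine values of matching keys with op; keys are assumed to align.
    return {k: op(v, d2[k]) for k, v in d1.items()}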
Example #3
    def get_value_func_dict(self, pol: Policy) -> VFDictType:
        # Iterative policy evaluation: apply the Bellman update
        # V(s) <- R(s) + gamma * sum_{s1} P(s, s1) * V(s1)
        # until the largest per-state change falls below self.tol.
        vf = {s: 0. for s in self.mdp_obj.all_states}
        epsilon = self.tol * 1e4
        mo = self.mdp_obj
        pd = pol.policy_data
        rew = mdp_rep_to_mrp_rep2(mo.rewards, pd)
        prob = mdp_rep_to_mrp_rep1(mo.transitions, pd)
        while epsilon >= self.tol:
            new_vf = {s: rew[s] + mo.gamma * sum(p * vf[s1]
                                                 for s1, p in prob[s].items())
                      for s in mo.all_states}
            epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
            vf = new_vf
        return vf
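A self-contained toy run of the same update, to show the fixed-point iteration in isolation (the two-state MRP below is made up for illustration):

# Toy MRP: per-state expected rewards and transition probabilities.
rewards = {'s0': 1.0, 's1': 0.0}
transitions = {'s0': {'s0': 0.5, 's1': 0.5},
               's1': {'s0': 0.2, 's1': 0.8}}
gamma, tol = 0.9, 1e-8

vf = {s: 0. for s in rewards}
epsilon = tol * 1e4
while epsilon >= tol:
    new_vf = {s: rewards[s] + gamma * sum(p * vf[s1]
                                          for s1, p in transitions[s].items())
              for s in rewards}
    epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
    vf = new_vf

print(vf)  # converges to the fixed point of V = R + gamma * P * V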
Example #4
    def get_mrp(self, pol: Policy) -> MRP:
        # Collapse transitions to {s: {s1: prob}} and rewards to a
        # per-state expected reward under the policy.
        tr = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data)
        rew = mdp_rep_to_mrp_rep2(self.rewards, pol.policy_data)
        return MRP({s: (v, rew[s]) for s, v in tr.items()}, self.gamma)
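mdp_rep_to_mrp_rep2 is also not shown; from its use here and in Example #3 it appears to reduce the per-state, per-action rewards {s: {a: r}} to a single expected reward per state under the policy. A minimal sketch under that assumption (the _sketch name and signature are illustrative):

from typing import Dict, Mapping, TypeVar

S = TypeVar('S')  # state type
A = TypeVar('A')  # action type

def mdp_rep_to_mrp_rep2_sketch(
    mdp_rep: Mapping[S, Mapping[A, float]],
    policy_rep: Mapping[S, Mapping[A, float]]
) -> Dict[S, float]:
    # Expected per-state reward under the policy:
    # r(s) = sum_a pi(a|s) * r(s, a)
    return {s: sum(policy_rep[s].get(a, 0.) * r for a, r in a_map.items())
            for s, a_map in mdp_rep.items()}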