Example #1
def get_soft_policy_from_qf_dict(qf_dict: SAf, softmax: bool,
                                 epsilon: float) -> Policy:
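    # Build a stochastic policy from Q-values: softmax over each state's
    # action values, or an epsilon-greedy distribution otherwise.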
    if softmax:
        ret = Policy(
            {s: get_softmax_action_probs(v)
             for s, v in qf_dict.items()})
    else:
        ret = Policy({
            s: get_epsilon_action_probs(v, epsilon)
            for s, v in qf_dict.items()
        })
    return ret
Example #2
    def get_mc_path(
        self,
        pol: Policy,
        start_state: S,
        start_action: Optional[A] = None,
    ) -> Sequence[Tuple[S, A, float, bool]]:

        res = []
        state = start_state
        steps = 0
        terminate = False
        occ_states = set()
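        # Per-state sampler that draws an action from the policy's distribution.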
        act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s))
                        for s in self.mdp_rep.state_action_dict.keys()}

        while not terminate:
            first = state not in occ_states
            occ_states.add(state)
            action = act_gen_dict[state]()\
                if (steps > 0 or start_action is None) else start_action
            next_state, reward =\
                self.mdp_rep.state_reward_gen_dict[state][action]()
            res.append((state, action, reward, first))
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.mdp_rep.terminal_states
            state = next_state
        return res
Example #3
def get_vf_dict_from_qf_dict_and_policy(qf_dict: SAf,
                                        pol: Policy) -> Mapping[S, float]:
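    # V(s) is the policy-weighted average of Q(s, a) over the actions in s.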
    return {
        s:
        sum(pol.get_state_action_probability(s, a) * q for a, q in v.items())
        for s, v in qf_dict.items()
    }
Example #4
    def get_value_func_dict(self, pol: Policy) -> VFDictType:
        sa_dict = self.mdp_rep.state_action_dict
        vf_dict = {s: 0.0 for s in sa_dict.keys()}
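        # Per-state sampler that draws an action from the policy's distribution.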
        act_gen_dict = {
            s: get_rv_gen_func_single(pol.get_state_probabilities(s))
            for s in sa_dict.keys()
        }
        episodes = 0
        updates = 0

        while episodes < self.num_episodes:
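            # Run one episode from a sampled initial state, applying TD(0) updates.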
            state = self.mdp_rep.init_state_gen()
            steps = 0
            terminate = False

            while not terminate:
                action = act_gen_dict[state]()
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_dict[state][action]()
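                # TD(0) update with a step size that decays as updates accumulate.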
                vf_dict[state] += self.learning_rate *\
                    (updates / self.learning_rate_decay + 1) ** -0.5 *\
                    (reward + self.mdp_rep.gamma * vf_dict[next_state] -
                     vf_dict[state])
                updates += 1
                steps += 1
                terminate = steps >= self.max_steps or \
                    state in self.mdp_rep.terminal_states
                state = next_state

            episodes += 1

        return vf_dict
Example #5
    def get_value_func(self, polf: PolicyType) -> Callable[[S], float]:
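        # Estimate a tabular Policy by sampling each state's functional policy
        # and fitting an action-probability distribution to the samples.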
        pol = Policy({s: get_pdf_from_samples(
            polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION)
        ) for s, v in self.get_state_action_dict().items()})

        # noinspection PyShadowingNames
        def vf(state: S, pol=pol) -> float:
            return self.get_value_func_dict(pol)[state]

        return vf
Example #6
if __name__ == '__main__':
    from mdp_dp_rl.processes.mdp import MDP
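    # Hand-built policy and MDP specifications over states {1, 2, 3}.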
    policy_data = {
        1: {
            'a': 0.4,
            'b': 0.6
        },
        2: {
            'a': 0.7,
            'c': 0.3
        },
        3: {
            'b': 1.0
        }
    }
    pol_obj = Policy(policy_data)
    mdp_data = {
        1: {
            'a': ({
                1: 0.2,
                2: 0.6,
                3: 0.2
            }, 7.0),
            'b': ({
                1: 0.6,
                2: 0.3,
                3: 0.1
            }, -2.0),
            'c': ({
                1: 0.1,
                2: 0.2,
Example #7
        elif i == 3:
            ret = {'b': 1.0}
        else:
            raise ValueError
        return ret

    this_qf = adp_obj.get_act_value_func_fa(policy_func, True)
    this_vf = adp_obj.get_value_func_fa(policy_func, True)
    print("Printing vf for a policy")
    print(this_vf(1))
    print(this_vf(2))
    print(this_vf(3))
    print("Printing DP vf for a policy")
    from mdp_dp_rl.processes.policy import Policy
    true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(
        Policy({s: policy_func(s)
                for s in {1, 2, 3}}))
    print(true_vf_for_pol)

    opt_det_polf = adp_obj.get_optimal_policy_func_vi()

    # noinspection PyShadowingNames
    def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]:
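        # Put all probability mass on the deterministic optimal action.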
        return {opt_det_polf(s): 1.0}

    print("Printing Opt Policy")
    print(opt_polf(1))
    print(opt_polf(2))
    print(opt_polf(3))

    opt_vf = adp_obj.get_value_func_fa(opt_polf, False)
    print("Printing Opt VF")
Example #8
def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy:
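    # Spread probability uniformly over each state's available actions.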
    return Policy(
        {s: {a: 1. / len(v)
             for a in v}
         for s, v in state_action_dict.items()})