Example #1
    def to_mrp(self, pi: Policy) -> MRP:

        # the goal here is to produce the input to the MRP constructor
        mrp_input = {}
        for state in self.s_a_s_.keys():
            output_states = set()
            output_reward = 0
            for action in pi.get_actions(state).keys():
                output_states = output_states.union(
                    set(self.s_a_s_[state][action].keys()))
                output_reward += self.s_a_r_[state][action] * pi.get_prob(
                    state, action)

            output_probs = {}

            for state2 in output_states:
                for action in pi.get_actions(state).keys():
                    if state2 in self.s_a_s_[state][action].keys():
                        if state2 in output_probs.keys():
                            output_probs[state2] += self.s_a_s_[state][action][
                                state2] * pi.get_prob(state, action)
                        else:
                            output_probs[state2] = self.s_a_s_[state][action][
                                state2] * pi.get_prob(state, action)

            mrp_input[state] = (output_probs, output_reward)

        return MRP(mrp_input, self.gamma_)
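For reference, a made-up illustration of the mrp_input shape built above: each state maps to a pair of (transition probabilities to next states under the policy, expected reward under the policy). The numbers are hypothetical.

# Hypothetical shape of mrp_input as assembled by to_mrp:
# state -> ({next_state: probability}, expected_reward_under_policy)
mrp_input_example = {
    1: ({1: 0.3, 2: 0.5, 3: 0.2}, 4.1),
    2: ({2: 0.6, 3: 0.4}, -1.0),
    3: ({3: 1.0}, 0.0)
}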
Example #2
def get_soft_policy_from_qf_dict(qf_dict: SAf, softmax: bool,
                                 epsilon: float) -> Policy:
    if softmax:
        ret = Policy(
            {s: get_softmax_action_probs(v)
             for s, v in qf_dict.items()})
    else:
        ret = Policy({
            s: get_epsilon_action_probs(v, epsilon)
            for s, v in qf_dict.items()
        })
    return ret
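The helpers get_softmax_action_probs and get_epsilon_action_probs are not shown in this example. The sketch below is only an assumption of the kind of action-probability dicts they would return (a softmax over Q-values and an epsilon-greedy distribution, respectively), not the library's actual implementation.

from math import exp
from typing import Mapping

# Illustrative sketches only (assumed behaviour, not the library code)
def softmax_action_probs_sketch(qv: Mapping[str, float]) -> Mapping[str, float]:
    m = max(qv.values())
    exps = {a: exp(q - m) for a, q in qv.items()}  # subtract max for numerical stability
    total = sum(exps.values())
    return {a: e / total for a, e in exps.items()}

def epsilon_action_probs_sketch(qv: Mapping[str, float],
                                epsilon: float) -> Mapping[str, float]:
    best = max(qv, key=qv.get)  # greedy action
    return {a: epsilon / len(qv) + ((1.0 - epsilon) if a == best else 0.0)
            for a in qv}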
Example #3
    def get_value_func_dict(self, pol: Policy) -> VFType:
        sa_dict = self.mdp_rep.state_action_dict
        vf_dict = {s: 0. for s in sa_dict.keys()}
        act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s))
                        for s in sa_dict.keys()}
        episodes = 0
        updates = 0

        while episodes < self.num_episodes:
            et_dict = {s: 0. for s in sa_dict.keys()}
            state = self.mdp_rep.init_state_gen()
            steps = 0
            terminate = False

            while not terminate:
                action = act_gen_dict[state]()
                next_state, reward =\
                    self.mdp_rep.state_reward_gen_dict[state][action]()
                delta = reward + self.mdp_rep.gamma * vf_dict[next_state] -\
                    vf_dict[state]
                et_dict[state] += 1
                alpha = self.learning_rate * (updates / self.learning_rate_decay
                                              + 1) ** -0.5
                for s in sa_dict.keys():
                    vf_dict[s] += alpha * delta * et_dict[s]
                    et_dict[s] *= self.gamma_lambda
                updates += 1
                steps += 1
                terminate = steps >= self.max_steps or\
                    state in self.mdp_rep.terminal_states
                state = next_state

            episodes += 1

        return vf_dict
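Isolating the accumulating-trace update from the TD(lambda) loop above as a minimal standalone sketch; the names are illustrative, not the class's API, and gamma * lambda_ plays the role of self.gamma_lambda.

# Minimal sketch of one TD(lambda) backup with accumulating eligibility traces
def td_lambda_step(vf, et, state, reward, next_state, gamma, lambda_, alpha):
    delta = reward + gamma * vf[next_state] - vf[state]  # TD error
    et[state] += 1.0                                     # accumulate trace for the visited state
    for s in vf:
        vf[s] += alpha * delta * et[s]                   # update every state in proportion to its trace
        et[s] *= gamma * lambda_                         # decay all traces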
Example #4
def get_vf_dict_from_qf_dict_and_policy(qf_dict: SAf,
                                        pol: Policy) -> Mapping[S, float]:
    return {
        s:
        sum(pol.get_state_action_probability(s, a) * q for a, q in v.items())
        for s, v in qf_dict.items()
    }
Example #5
def policy_improve(mdp: MDP, vf: VF) -> Policy:
    
    new_pol = {}

    # for each state
    for state in mdp.states_:

        # find maximizing action
        max_val = float('-inf')
        max_action = []

        # iterate across actions
        for action in mdp.s_a_s_[state].keys():
            action_val = 0

            # find the expected action value
            for state2 in mdp.s_a_s_[state][action].keys():
                action_val += mdp.s_a_s_[state][action][state2][0] * (
                    mdp.s_a_s_[state][action][state2][1] +
                    mdp.gamma_ * vf.value_dict_[state2])

            # update if new max
            if action_val > max_val:
                max_val = action_val
                max_action = [action]
            elif action_val == max_val:
                max_action.append(action)
            
        actions = {}
        for action in max_action:
            actions[action] = 1.0 / len(max_action)

        new_pol[state] = actions
    
    return Policy(new_pol)
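This version reads mdp.s_a_s_[state][action][state2] as a (probability, reward) pair, the MDPRefined-style data seen in Example #14; Example #13 is the variant where s_a_s_ holds probabilities only and rewards sit in s_a_r_. A made-up fragment of the refined form:

# Hypothetical MDPRefined-style data: state -> action -> next_state -> (prob, reward)
s_a_s_example = {
    1: {
        'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)},
        'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}
    }
}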
Example #6
    def get_mc_path(
        self,
        pol: Policy,
        start_state: S,
        start_action: Optional[A] = None,
    ) -> Sequence[Tuple[S, A, float, bool]]:

        res = []
        next_state = start_state
        steps = 0
        terminate = False
        occ_states = set()
        act_gen_dict = {
            s: get_rv_gen_func_single(pol.get_state_probabilities(s))
            for s in self.mdp_rep.state_action_dict.keys()
        }

        while not terminate:
            state = next_state
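            # first-visit flag: True only the first time this state appears in the episode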
            first = state not in occ_states
            occ_states.add(state)
            action = act_gen_dict[state]()\
                if (steps > 0 or start_action is None) else start_action
            next_state, reward =\
                self.mdp_rep.state_reward_gen_dict[state][action]()
            res.append((state, action, reward, first))
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.mdp_rep.terminal_states
        return res
Example #7
    def get_value_func_dict(self, pol: Policy) -> VFType:
        sa_dict = self.mdp_rep.state_action_dict
        vf_dict = {s: 0.0 for s in sa_dict.keys()}
        act_gen_dict = {
            s: get_rv_gen_func_single(pol.get_state_probabilities(s))
            for s in sa_dict.keys()
        }
        episodes = 0

        while episodes < self.num_episodes:
            state = self.mdp_rep.init_state_gen()
            steps = 0
            terminate = False

            while not terminate:
                action = act_gen_dict[state]()
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_dict[state][action]()
                vf_dict[state] += self.alpha * \
                    (reward + self.mdp_rep.gamma * vf_dict[next_state] -
                     vf_dict[state])
                state = next_state
                steps += 1
                terminate = steps >= self.max_steps or \
                    state in self.mdp_rep.terminal_states

            episodes += 1

        return vf_dict
Example #8
    def get_optimal_policy(self, tol=1e-4) -> DetPolicy:
        pol = Policy({s: {a: 1. / len(v) for a in v} for s, v in
                      self.state_action_dict.items()})
        vf = self.get_value_func_dict(pol)
        epsilon = tol * 1e4
        while epsilon >= tol:
            pol = self.get_improved_policy(pol)
            new_vf = self.get_value_func_dict(pol)
            epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
            vf = new_vf
        return pol
Example #9
    def get_value_func(self, polf: PolicyType) -> Callable[[S], float]:
        pol = Policy({
            s: get_pdf_from_samples(
                polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION))
            for s, v in self.get_state_action_dict().items()
        })

        # noinspection PyShadowingNames
        def vf(state: S, pol=pol) -> float:
            return self.get_value_func_dict(pol)[state]

        return vf
Example #10
    def get_value_func(self, pol_func: Callable[[S], Callable[[A], float]])\
            -> Callable[[S], float]:
        pol = Policy({
            s: {a: pol_func(s)(a)
                for a in v}
            for s, v in self.get_state_action_dict().items()
        })

        # noinspection PyShadowingNames
        def vf(state: S, pol=pol) -> float:
            return self.get_value_func_dict(pol)[state]

        return vf
Example #11
    def get_act_value_func(self, pol_func: Callable[[S], Callable[[A], float]])\
            -> Callable[[S], Callable[[A], float]]:
        pol = Policy({
            s: {a: pol_func(s)(a)
                for a in v}
            for s, v in self.get_state_action_dict().items()
        })

        # noinspection PyShadowingNames
        def qvf(state: S, pol=pol) -> Callable[[A], float]:

            # noinspection PyShadowingNames
            def inner_f(action: A, pol=pol, state=state) -> float:
                return self.get_act_value_func_dict(pol)[state][action]

            return inner_f

        return qvf
Example #12
    def get_optimal_policy(self, tol=1e-4) -> DetPolicy:
        # Initialize a (uniform) policy
        pol = Policy({s: {a: 1. / len(v) for a in v} for s, v in
                      self.state_action_dict.items()})
        # Compute the value function of the initial policy
        vf = self.get_value_func_dict(pol)
        # Convergence criterion
        epsilon = tol * 1e4
        while epsilon >= tol:
            # Policy improvement: act greedily with respect to the current
            # value function in each state to form the new, improved policy
            pol = self.get_improved_policy(pol)
            # Evaluate the improved policy to get the new value function
            new_vf = self.get_value_func_dict(pol)
            # Check convergence
            epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
            vf = new_vf
        return pol
Example #13
def policy_improve(mdp: MDP, vf: VF) -> Policy:

    new_pol = {}
    for state in mdp.states_:
        max_val = float('-inf')
        max_action = []
        for action in mdp.s_a_s_[state].keys():
            action_val = 0
            for state2 in mdp.s_a_s_[state][action].keys():
                action_val += mdp.s_a_s_[state][action][state2] * (
                    mdp.s_a_r_[state][action] +
                    mdp.gamma_ * vf.get_value(state2))
            if action_val > max_val:
                max_val = action_val
                max_action = [action]
            elif action_val == max_val:
                max_action.append(action)
        actions = {}
        for action in max_action:
            actions[action] = 1.0 / len(max_action)
        new_pol[state] = actions

    return Policy(new_pol)
Example #14
                3: (0.4, -8.2)
            }
        },
        3: {
            'a': {
                3: (1.0, 0.0)
            },
            'b': {
                3: (1.0, 0.0)
            }
        }
    }
    mdp2_obj = MDPRefined(mdp_refined_data, 0.97)
    policy_data = {
        1: {
            'a': 0.4,
            'b': 0.6
        },
        2: {
            'a': 0.7,
            'c': 0.3
        },
        3: {
            'b': 1.0
        }
    }
    pol_obj = Policy(policy_data)
    mrp_refined_obj = mdp2_obj.get_mrp_refined(pol_obj)
    print(mrp_refined_obj.transitions)
    print(mrp_refined_obj.rewards_refined)
Example #15
            raise ValueError
        return ret

    def pf_as_policy_type(i: int) -> Callable[[int], Sequence[str]]:
        return get_sampling_func_from_prob_dict(policy_func(i))

    this_qf = adp_pg_obj.get_act_value_func(pf_as_policy_type)
    this_vf = adp_pg_obj.get_value_func(pf_as_policy_type)
    print("Printing vf for a policy")
    print(this_vf(1))
    print(this_vf(2))
    print(this_vf(3))
    print("Printing DP vf for a policy")
    from processes.policy import Policy
    true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(
        Policy({s: policy_func(s)
                for s in {1, 2, 3}}))
    print(true_vf_for_pol)

    opt_det_polf = adp_pg_obj.get_optimal_det_policy_func()

    # noinspection PyShadowingNames
    def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]:
        return {opt_det_polf(s): 1.0}

    print("Printing Opt Policy")
    print(opt_polf(1))
    print(opt_polf(2))
    print(opt_polf(3))

    opt_vf = adp_pg_obj.get_value_func(adp_pg_obj.get_policy_as_policy_type())
    print("Printing Opt VF")
Example #16
def get_epsilon_policy_from_qf(qf_dict: Mapping[S, Mapping[A, float]],
                               epsilon: float) -> Policy:
    return Policy(
        {s: get_epsilon_action_probs(v, epsilon)
         for s, v in qf_dict.items()})
Example #17
def get_softmax_policy_from_qf(
        qf_dict: Mapping[S, Mapping[A, float]]) -> Policy:
    return Policy({s: get_softmax_action_probs(v) for s, v in qf_dict.items()})
Example #18
        13: {
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
        14: {
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
        15: {
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
        0: {
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
    }

    policy = Policy(policy_data)
    vf = policy_eval(mdp, policy, 0.001)

    print(value_iter(mdp, vf))
Example #19
            ret = {'b': 1.0}
        else:
            raise ValueError
        return ret


    this_qf = adp_obj.get_act_value_func_fa(policy_func, True)
    this_vf = adp_obj.get_value_func_fa(policy_func, True)
    print("Printing vf for a policy")
    print(this_vf(1))
    print(this_vf(2))
    print(this_vf(3))
    print("Printing DP vf for a policy")
    from processes.policy import Policy
    true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(Policy(
        {s: policy_func(s) for s in {1, 2, 3}}
    ))
    print(true_vf_for_pol)

    opt_det_polf = adp_obj.get_optimal_policy_func_vi()

    # noinspection PyShadowingNames
    def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]:
        return {opt_det_polf(s): 1.0}

    print("Printing Opt Policy")
    print(opt_polf(1))
    print(opt_polf(2))
    print(opt_polf(3))

    opt_vf = adp_obj.get_value_func_fa(opt_polf, False)
Example #20
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
        14: {
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
        15: {
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
        0: {
            'n': 0.25,
            's': 0.25,
            'w': 0.25,
            'e': 0.25
        },
    }

    pol = Policy(policy_data)
    #vf = policy_eval(mdp, pol, 0.001)
    #new_pol = policy_improve(mdp, pol, vf)
    #print(new_pol)
    vf, pol = policy_iter(mdp, pol, 0.001)
Example #21
def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy:
    return Policy(
        {s: {a: 1. / len(v)
             for a in v}
         for s, v in state_action_dict.items()})
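A tiny usage sketch with made-up data, showing the shapes get_uniform_policy consumes and produces:

# Hypothetical usage: uniform action probabilities over each state's action set
sa_dict = {1: {'a', 'b'}, 2: {'a', 'b', 'c'}}
uniform_pol = get_uniform_policy(sa_dict)
# wraps {1: {'a': 0.5, 'b': 0.5}, 2: {'a': 1/3, 'b': 1/3, 'c': 1/3}}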