Example #1
def finite_horizon_MRP(process: FiniteMarkovRewardProcess[S],
                       limit: int) -> FiniteMarkovRewardProcess[WithTime[S]]:
    """Turn a normal FiniteMarkovRewardProcess into one with a finite horizon
    that stops after 'limit' steps.

    Note that this makes the data representation of the process
    larger, since we end up having distinct sets and transitions for
    every single time step up to the limit.

    """
    transition_map: Dict[WithTime[S], Optional[RewardOutcome]] = {}

    # Non-terminal states
    for time in range(0, limit):

        for s in process.states():
            result: Optional[StateReward[S]] = process.transition_reward(s)
            s_time = WithTime(state=s, time=time)

            transition_map[s_time] = (None if result is None else result.map(
                lambda s_r: (WithTime(state=s_r[0], time=time + 1), s_r[1])))

    # Terminal states
    for s in process.states():
        transition_map[WithTime(state=s, time=limit)] = None

    return FiniteMarkovRewardProcess(transition_map)
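
A minimal usage sketch for the function above, with hypothetical states and rewards; it assumes the same library version as this example, where the Categorical and FiniteMarkovRewardProcess constructors take plain dicts and a terminal state maps to None:

from rl.distribution import Categorical
from rl.markov_process import FiniteMarkovRewardProcess

# Two-state MRP: from 'A' we stay in 'A' (reward 1) or move to 'B'
# (reward 2); 'B' is terminal, so it maps to None in this API version.
base = FiniteMarkovRewardProcess({
    'A': Categorical({('A', 1.0): 0.5, ('B', 2.0): 0.5}),
    'B': None,
})

# Unroll three steps: the new state space is WithTime(state=s, time=t)
# for t = 0..3, and every state at time == 3 is terminal.
bounded = finite_horizon_MRP(base, limit=3)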
Example #2
    def apply_finite_policy(self, policy: FinitePolicy[S, A])\
            -> FiniteMarkovRewardProcess[S]:

        transition_mapping: Dict[S, FiniteDistribution[Tuple[S, float]]] = {}

        for state in self.mapping:
            action_map: ActionMapping[A, S] = self.mapping[state]
            outcomes: DefaultDict[Tuple[S, float], float]\
                = defaultdict(float)
            actions = policy.act(state)
            for action, p_action in actions:
                for (s1, r), p in action_map[action].table().items():
                    outcomes[(s1.state, r)] += p_action * p

            transition_mapping[state.state] = Categorical(outcomes)

        return FiniteMarkovRewardProcess(transition_mapping)
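
The nested loop above is a marginalization over actions: each state's (next_state, reward) distribution in the resulting MRP is the policy-weighted mixture of the MDP's per-action distributions. A plain-Python sketch of that computation with made-up numbers:

from collections import defaultdict
from typing import DefaultDict, Tuple

# Per-action (next_state, reward) probabilities for one state (illustrative).
action_map = {
    'left':  {('s1', 0.0): 1.0},
    'right': {('s1', 0.0): 0.3, ('s2', 5.0): 0.7},
}
# Policy probabilities over actions in that state (illustrative).
policy_probs = {'left': 0.4, 'right': 0.6}

outcomes: DefaultDict[Tuple[str, float], float] = defaultdict(float)
for action, p_action in policy_probs.items():
    for (next_state, reward), p in action_map[action].items():
        outcomes[(next_state, reward)] += p_action * p

print(dict(outcomes))  # roughly {('s1', 0.0): 0.58, ('s2', 5.0): 0.42}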
Example #3
def finite_mrp(
    fixed_experiences: Sequence[TransitionStep[S]]
) -> FiniteMarkovRewardProcess[S]:
    def by_state(tr: TransitionStep[S]) -> S:
        return tr.state.state

    # Group the observed transitions by the state they start from.
    d: Mapping[S, Sequence[Tuple[S, float]]] = \
        {s: [(t.next_state.state, t.reward) for t in l] for s, l in
         itertools.groupby(
             sorted(fixed_experiences, key=by_state),
             key=by_state
         )}
    # Turn each group's (next_state, reward) counts into an empirical
    # Categorical distribution.
    mrp: Dict[S, Categorical[Tuple[S, float]]] = \
        {s: Categorical({x: y / len(l) for x, y in
                         collections.Counter(l).items()})
         for s, l in d.items()}
    return FiniteMarkovRewardProcess(mrp)
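
The Counter step above turns each state's observed (next_state, reward) pairs into an empirical distribution: each pair's probability is its count divided by the number of transitions recorded from that state. A plain-Python sketch of that estimate with made-up data:

from collections import Counter

# Observed (next_state, reward) pairs out of a single state (illustrative).
observed = [('B', 1.0), ('B', 1.0), ('C', 0.0), ('B', 1.0)]

empirical = {pair: count / len(observed)
             for pair, count in Counter(observed).items()}
print(empirical)  # {('B', 1.0): 0.75, ('C', 0.0): 0.25}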
Example #4
    def apply_finite_policy(
            self, policy: FinitePolicy[S, A]) -> FiniteMarkovRewardProcess[S]:

        transition_mapping: Dict[S, Optional[StateReward[S]]] = {}

        for state in self.mapping:
            action_map: Optional[ActionMapping[A, S]] = self.mapping[state]

            if action_map is None:
                transition_mapping[state] = None
            else:
                outcomes: DefaultDict[Tuple[S, float],
                                      float] = defaultdict(float)

                actions = policy.act(state)
                if actions is not None:
                    for action, p_action in actions:
                        for outcome, p_state_reward in action_map[action]:
                            outcomes[outcome] += p_action * p_state_reward

                transition_mapping[state] = Categorical(outcomes)

        return FiniteMarkovRewardProcess(transition_mapping)
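
Unlike Example #2, this variant carries terminal states through explicitly: a state whose action mapping is None keeps a None entry in the resulting MRP. The transition_mapping it builds has roughly this shape, shown here with hypothetical states and values:

from rl.distribution import Categorical

transition_mapping = {
    'start': Categorical({('start', -1.0): 0.5, ('goal', 10.0): 0.5}),
    'goal': None,   # terminal: no actions, so no outgoing transitions
}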
Example #5
def finite_horizon_MRP(process: FiniteMarkovRewardProcess[S],
                       limit: int) -> FiniteMarkovRewardProcess[WithTime[S]]:
    '''Turn a normal FiniteMarkovRewardProcess into one with a finite horizon
    that stops after 'limit' steps.

    Note that this makes the data representation of the process
    larger, since we end up having distinct sets and transitions for
    every single time step up to the limit.

    '''
    transition_map: Dict[WithTime[S], RewardOutcome] = {}

    # Non-terminal states
    for time in range(limit):

        for s in process.non_terminal_states:
            result: StateReward[S] = process.transition_reward(s)
            s_time = WithTime(state=s.state, time=time)

            transition_map[s_time] = result.map(
                lambda sr: (WithTime(state=sr[0].state, time=time + 1), sr[1]))

    return FiniteMarkovRewardProcess(transition_map)
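
Unlike Example #1, this version adds no explicit entries for states at time == limit; it relies on the (assumed) newer library behavior that any state appearing only as a successor in the transition map is treated as terminal. A minimal usage sketch under that assumption, with hypothetical states and rewards:

from rl.distribution import Categorical
from rl.markov_process import FiniteMarkovRewardProcess

# 'B' never appears as a key, so the constructor treats it as terminal
# (assumed behavior of this library version).
base = FiniteMarkovRewardProcess({
    'A': Categorical({('A', 1.0): 0.5, ('B', 2.0): 0.5}),
})
bounded = finite_horizon_MRP(base, limit=3)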
Example #6
File: prob4.py Project: lkourti/RL-book
    def __init__(self, sl_mapping: SLMapping):
        SnakesLaddersFMP.__init__(self, sl_mapping)
        FiniteMarkovRewardProcess.__init__(
            self, self.get_transition_reward_map())