def finite_horizon_MRP(process: FiniteMarkovRewardProcess[S],
                       limit: int) -> FiniteMarkovRewardProcess[WithTime[S]]:
    """Turn a normal FiniteMarkovRewardProcess into one with a finite horizon
    that stops after 'limit' steps.

    Note that this makes the data representation of the process larger, since
    we end up having distinct sets and transitions for every single time step
    up to the limit.
    """
    transition_map: Dict[WithTime[S], Optional[RewardOutcome]] = {}

    # Non-terminal states
    for time in range(0, limit):
        for s in process.states():
            result: Optional[StateReward[S]] = process.transition_reward(s)
            s_time = WithTime(state=s, time=time)

            transition_map[s_time] = (
                None if result is None else result.map(
                    lambda s_r: (WithTime(state=s_r[0], time=time + 1), s_r[1])
                )
            )

    # Terminal states
    for s in process.states():
        transition_map[WithTime(state=s, time=limit)] = None

    return FiniteMarkovRewardProcess(transition_map)
def apply_finite_policy(self, policy: FinitePolicy[S, A])\
        -> FiniteMarkovRewardProcess[S]:

    transition_mapping: Dict[S, FiniteDistribution[Tuple[S, float]]] = {}

    for state in self.mapping:
        action_map: ActionMapping[A, S] = self.mapping[state]
        outcomes: DefaultDict[Tuple[S, float], float]\
            = defaultdict(float)
        actions = policy.act(state)
        for action, p_action in actions:
            for (s1, r), p in action_map[action].table().items():
                outcomes[(s1.state, r)] += p_action * p

        transition_mapping[state.state] = Categorical(outcomes)

    return FiniteMarkovRewardProcess(transition_mapping)
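# A minimal usage sketch for apply_finite_policy, not part of the library code.
# It assumes FiniteMarkovDecisionProcess can be built from a nested mapping
# state -> action -> Categorical[(next_state, reward)] and FinitePolicy from a
# mapping state -> Categorical[action]; the states, actions and numbers below
# are purely illustrative.
mdp_map = {
    'A': {'stay': Categorical({('A', 1.0): 1.0}),
          'go':   Categorical({('B', 0.0): 0.7, ('A', 2.0): 0.3})},
    'B': {'stay': Categorical({('B', 0.5): 1.0}),
          'go':   Categorical({('A', 0.0): 1.0})}
}
mdp = FiniteMarkovDecisionProcess(mdp_map)
uniform_policy = FinitePolicy(
    {s: Categorical({'stay': 0.5, 'go': 0.5}) for s in mdp_map}
)
# Fixing the policy collapses the MDP into an MRP whose transition
# probabilities are the policy-weighted mixtures computed above.
implied_mrp: FiniteMarkovRewardProcess[str] = \
    mdp.apply_finite_policy(uniform_policy)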
def finite_mrp(
    fixed_experiences: Sequence[TransitionStep[S]]
) -> FiniteMarkovRewardProcess[S]:

    def by_state(tr: TransitionStep[S]) -> S:
        return tr.state.state

    d: Mapping[S, Sequence[Tuple[S, float]]] = \
        {s: [(t.next_state.state, t.reward) for t in l]
         for s, l in itertools.groupby(
             sorted(fixed_experiences, key=by_state),
             key=by_state
         )}

    mrp: Dict[S, Categorical[Tuple[S, float]]] = \
        {s: Categorical({x: y / len(l) for x, y in
                         collections.Counter(l).items()})
         for s, l in d.items()}

    return FiniteMarkovRewardProcess(mrp)
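# A small sketch of finite_mrp on a fixed batch of atomic experiences, for
# illustration only. It assumes TransitionStep carries (state, next_state,
# reward) with states wrapped in NonTerminal (matching the .state.state access
# above); the data itself is made up.
fixed_experiences = [
    TransitionStep(NonTerminal('A'), NonTerminal('B'), 1.0),
    TransitionStep(NonTerminal('A'), NonTerminal('B'), 1.0),
    TransitionStep(NonTerminal('A'), NonTerminal('A'), 0.0),
    TransitionStep(NonTerminal('B'), NonTerminal('A'), 2.0)
]
# Estimated model: from 'A', outcome ('B', 1.0) with probability 2/3 and
# ('A', 0.0) with probability 1/3; from 'B', outcome ('A', 2.0) with
# probability 1.
est_mrp: FiniteMarkovRewardProcess[str] = finite_mrp(fixed_experiences)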
def apply_finite_policy(
    self,
    policy: FinitePolicy[S, A]
) -> FiniteMarkovRewardProcess[S]:

    transition_mapping: Dict[S, Optional[StateReward[S]]] = {}

    for state in self.mapping:
        action_map: Optional[ActionMapping[A, S]] = self.mapping[state]
        if action_map is None:
            transition_mapping[state] = None
        else:
            outcomes: DefaultDict[Tuple[S, float], float] = \
                defaultdict(float)
            actions = policy.act(state)
            if actions is not None:
                for action, p_action in actions:
                    for outcome, p_state_reward in action_map[action]:
                        outcomes[outcome] += p_action * p_state_reward

            transition_mapping[state] = Categorical(outcomes)

    return FiniteMarkovRewardProcess(transition_mapping)
def finite_horizon_MRP(
    process: FiniteMarkovRewardProcess[S],
    limit: int
) -> FiniteMarkovRewardProcess[WithTime[S]]:
    '''Turn a normal FiniteMarkovRewardProcess into one with a finite horizon
    that stops after 'limit' steps.

    Note that this makes the data representation of the process larger, since
    we end up having distinct sets and transitions for every single time step
    up to the limit.
    '''
    transition_map: Dict[WithTime[S], RewardOutcome] = {}

    # Non-terminal states
    for time in range(limit):
        for s in process.non_terminal_states:
            result: StateReward[S] = process.transition_reward(s)
            s_time = WithTime(state=s.state, time=time)

            transition_map[s_time] = result.map(
                lambda sr: (WithTime(state=sr[0].state, time=time + 1), sr[1])
            )

    return FiniteMarkovRewardProcess(transition_map)
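# A brief sketch of finite_horizon_MRP, assuming FiniteMarkovRewardProcess can
# be built from a mapping state -> Categorical[(next_state, reward)]; the
# two-state process and the horizon below are illustrative only.
base_mrp = FiniteMarkovRewardProcess({
    'A': Categorical({('A', 1.0): 0.6, ('B', 0.0): 0.4}),
    'B': Categorical({('A', 2.0): 1.0})
})
# Unroll the process over 3 time steps: the result has non-terminal states
# WithTime('A', t) and WithTime('B', t) for t = 0, 1, 2, while every state at
# time 3 is terminal because it never appears as a key of the transition map.
unrolled = finite_horizon_MRP(base_mrp, limit=3)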
def __init__(self, sl_mapping: SLMapping):
    SnakesLaddersFMP.__init__(self, sl_mapping)
    FiniteMarkovRewardProcess.__init__(
        self,
        self.get_transition_reward_map()
    )
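# A hypothetical usage sketch for the constructor above. The class name
# SnakesLaddersMRP, the concrete shape of SLMapping, and the availability of
# get_value_function_vec on FiniteMarkovRewardProcess are assumptions about
# the surrounding codebase, not shown in this section.
sl_mrp = SnakesLaddersMRP(sl_mapping)  # sl_mapping: an SLMapping of the board
# With gamma = 1.0, the value function gives the expected cumulative reward
# (e.g. expected number of remaining dice rolls) from each non-terminal square.
print(sl_mrp.get_value_function_vec(gamma=1.0))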