def setUp(self):
    """Build the shared fixtures: a FlipFlop process and the equivalent
    explicitly-tabulated two-state MDP.

    In the tabulated MDP, action True keeps the current state with
    probability 0.7 (reward 1.0) and flips with 0.3 (reward 2.0);
    action False swaps those probabilities.
    """
    random.seed(42)
    self.finite_flip_flop = FlipFlop(0.7)

    stay, flip = 0.7, 0.3
    transition_map = {}
    for s in (True, False):
        transition_map[s] = {
            True: Categorical({(s, 1.0): stay, (not s, 2.0): flip}),
            False: Categorical({(s, 1.0): flip, (not s, 2.0): stay}),
        }
    self.finite_mdp = FiniteMarkovDecisionProcess(transition_map)
def finite_horizon_MDP(
        process: FiniteMarkovDecisionProcess[S, A],
        limit: int) -> FiniteMarkovDecisionProcess[WithTime[S], A]:
    """Turn a normal FiniteMarkovDecisionProcess into one with a finite
    horizon that stops after 'limit' steps.

    Note that this makes the data representation of the process larger,
    since we end up having distinct sets and transitions for every
    single time step up to the limit.

    :param process: the time-homogeneous process to unroll
    :param limit: number of steps before every state becomes terminal
    :returns: an equivalent process over time-stamped states
    """
    mapping: Dict[WithTime[S], Optional[Dict[A, StateReward[WithTime[S]]]]] = {}

    # Non-terminal states: one time-stamped copy of every state for each
    # step 0..limit-1, with transitions advancing the time component.
    for time in range(0, limit):
        for s in process.states():
            s_time = WithTime(state=s, time=time)
            actions_map = process.action_mapping(s)
            if actions_map is None:
                mapping[s_time] = None
            else:
                # BUG FIX: bind `time` as a default argument so each lambda
                # captures this iteration's value.  A bare closure late-binds
                # and would read the *final* value of `time` if `map`
                # evaluates lazily.
                mapping[s_time] = {
                    a: result.map(
                        lambda s_r, time=time:
                        (WithTime(state=s_r[0], time=time + 1), s_r[1]))
                    for a, result in actions_map.items()
                }

    # All states at time `limit` are terminal.
    for s in process.states():
        mapping[WithTime(state=s, time=limit)] = None

    return FiniteMarkovDecisionProcess(mapping)
def finite_horizon_MDP(
        process: FiniteMarkovDecisionProcess[S, A],
        limit: int) -> FiniteMarkovDecisionProcess[WithTime[S], A]:
    '''Turn a normal FiniteMarkovDecisionProcess into one with a finite
    horizon that stops after 'limit' steps.

    Note that this makes the data representation of the process larger,
    since we end up having distinct sets and transitions for every
    single time step up to the limit.
    '''
    mapping: Dict[WithTime[S],
                  Dict[A, FiniteDistribution[Tuple[WithTime[S], float]]]] = {}

    # Non-terminal states: one time-stamped copy per step 0..limit-1.
    # States at time `limit` are simply absent from the mapping, which the
    # newer API treats as terminal.
    for time in range(0, limit):
        for s in process.non_terminal_states:
            s_time = WithTime(state=s.state, time=time)
            actions_map = process.action_mapping(s)
            # BUG FIX: `time` is bound as a default argument so each lambda
            # captures this iteration's value; a bare closure late-binds and
            # would see the final value of `time` under lazy evaluation.
            mapping[s_time] = {
                a: result.map(
                    lambda sr, time=time:
                    (WithTime(state=sr[0].state, time=time + 1), sr[1]))
                for a, result in actions_map.items()
            }
    return FiniteMarkovDecisionProcess(mapping)
def sarsa_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        num_episodes: int = 10000,
        eps: float = 0.1,
        base_lr: float = 0.03,
        half_life: float = 1000.0,
        exponent: float = 0.5) -> Mapping[Tuple[S, A], float]:
    """Tabular SARSA control with an epsilon-greedy behavior policy.

    Performs `num_episodes` single-step SARSA updates while following the
    current policy, improving the policy epsilon-greedily at each visited
    state.  The learning rate decays with the visit count of each
    (state, action) pair: base_lr / (1 + ((n - 1) / half_life) ** exponent).

    :param mdp_to_sample: MDP used to sample transitions via `step`
    :param states: all states to tabulate
    :param actions: per-state action lists (None for terminal states)
    :param γ: discount factor
    :param num_episodes: number of one-step updates to perform
    :param eps: exploration probability for the epsilon-greedy policy
    :returns: the learned Q-value table keyed by (state, action)
    """
    q: Dict[Tuple[S, A], float] = {}
    counts_per_state_act: Dict[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0

    # Start from a uniform random policy over each state's actions.
    policy_map: Dict[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)

    state = Categorical({s: 1 for s in states}).sample()
    for i in range(num_episodes):
        action = Pi.act(state).sample()
        next_state, reward = mdp_to_sample.step(state, action).sample()

        # Bootstrap target: Q of the next (state, action) under the current
        # policy, or 0 when the next state is terminal.  BUG FIX: the
        # original called `Pi.act(next_state).sample()` unconditionally,
        # which raises when `act` returns None at a terminal state.
        next_action_dist = Pi.act(next_state)
        if next_action_dist is None:
            q_next = 0.
        else:
            q_next = q[(next_state, next_action_dist.sample())]

        counts_per_state_act[(state, action)] += 1
        alpha = base_lr / (1 + (
            (counts_per_state_act[(state, action)] - 1) / half_life
        )**exponent)
        q[(state, action)] += alpha * (reward + γ * q_next - q[(state, action)])

        # Epsilon-greedy improvement at the visited state.  BUG FIX: copy
        # the policy map instead of aliasing `Pi.policy_map` (the original
        # mutated the live policy's own dict), and keep the non-terminal
        # branch in an `else` (the original fell through and crashed on
        # `len(None)` for terminal states).
        new_pol: Dict[S, Optional[Categorical[A]]] = dict(Pi.policy_map)
        if actions[state] is None:
            new_pol[state] = None
        else:
            probs = {a: eps / len(actions[state]) for a in actions[state]}
            # Greedy action; ties resolve to the last max, as before.
            best_action = actions[state][0]
            for a in actions[state]:
                if q[(state, best_action)] <= q[(state, a)]:
                    best_action = a
            probs[best_action] += 1 - eps
            new_pol[state] = Categorical(probs)
        Pi = FinitePolicy(new_pol)

        # Continue the trajectory; restart from a uniformly random state
        # when a terminal state was reached.  BUG FIX: the original tested
        # `next_state is None`, which never fires for real states.
        if next_action_dist is None:
            state = Categorical({s: 1 for s in states}).sample()
        else:
            state = next_state
    return q
def get_finite_mdp(self) -> FiniteMarkovDecisionProcess[Cell, Move]:
    '''Build and return the FiniteMarkovDecisionProcess for this windy
    grid problem.
    '''
    transition_map = {}
    for nt_state in self.get_all_nt_states():
        transition_map[nt_state] = self.get_transition_probabilities(nt_state)
    return FiniteMarkovDecisionProcess(transition_map)
def get_finite_mdp(self) -> FiniteMarkovDecisionProcess[Cell, Move]:
    '''Build and return the FiniteMarkovDecisionProcess for this windy
    grid problem, including its terminal cells.
    '''
    mapping: StateActionMapping[Cell, Move] = {
        s: self.get_transition_probabilities(s)
        for s in self.get_all_nt_states()
    }
    # Terminal cells carry no action mapping.
    for terminal in self.terminals:
        mapping[terminal] = None
    return FiniteMarkovDecisionProcess(mapping)
def initialize(
    mdp: FiniteMarkovDecisionProcess
) -> Tuple[V[S], FinitePolicy]:
    """Initialize value function and policy.

    The value function starts at zero for every state (terminal states
    included); the policy is a uniform random choice over the action
    space at each non-terminal state.

    :param mdp: Object representation of a finite Markov decision process
    :returns: Value function initialized at zeros for each state
    :returns: Random initial policy
    """
    # Zero value estimate for every state.
    v_0: V[S] = {}
    for s in mdp.states():
        v_0[s] = 0

    # Uniform random action at every non-terminal state.
    pi_0: FinitePolicy[S, A] = FinitePolicy({
        s: Choose(set(mdp.actions(s)))
        for s in mdp.non_terminal_states
    })
    return v_0, pi_0
def mc_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:
    """Tabular every-visit Monte-Carlo control with epsilon-greedy
    improvement and a GLIE schedule (eps = 1 / episode number).

    :param mdp_to_sample: MDP sampled via `simulate_actions`
    :param states: all states to tabulate
    :param actions: per-state action lists (None for terminal states)
    :param γ: discount factor
    :param tolerance: truncation tolerance passed to `returns`
    :param num_episodes: number of episodes to simulate
    :returns: the learned Q-value table keyed by (state, action)
    """
    q: Dict[Tuple[S, A], float] = {}
    counts_per_state_act: Dict[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0

    # Start from a uniform random policy over each state's actions.
    policy_map: Dict[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)

    start_state_distrib = Categorical({state: 1 for state in states})
    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)

        # Incremental-mean update of Q over every step of the episode.
        for step in episode:
            sa = (step.state, step.action)
            counts_per_state_act[sa] += 1
            q[sa] += (step.return_ - q[sa]) / counts_per_state_act[sa]

        # Epsilon-greedy improvement with eps decaying as 1/k.
        eps = 1 / (i + 1)
        new_pol: Dict[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
            else:
                # BUG FIX: this branch must be an `else`; the original fell
                # through for terminal states and crashed on `len(None)`.
                probs = {a: eps / len(actions[state]) for a in actions[state]}
                # Greedy action; ties resolve to the last max, as before.
                best_action = actions[state][0]
                for a in actions[state]:
                    if q[(state, best_action)] <= q[(state, a)]:
                        best_action = a
                probs[best_action] += 1 - eps
                new_pol[state] = Categorical(probs)
        Pi = FinitePolicy(new_pol)
    return q
def setUp(self):
    """Build a two-state flip-flop MDP fixture: action True keeps the
    current state with probability 0.7 (reward 1.0) and flips with 0.3
    (reward 2.0); action False swaps those probabilities.
    """
    stay, flip = 0.7, 0.3
    mapping = {}
    for s in (True, False):
        mapping[s] = {
            True: Categorical({(s, 1.0): stay, (not s, 2.0): flip}),
            False: Categorical({(s, 1.0): flip, (not s, 2.0): stay}),
        }
    self.finite_flip_flop = FiniteMarkovDecisionProcess(mapping)
def __init__(self, initial_inventory: int, time_steps: int,
             price_lambda_pairs: Sequence[Tuple[float, float]]):
    """Set up the clearance-pricing problem.

    :param initial_inventory: units in stock at time 0
    :param time_steps: length of the selling horizon
    :param price_lambda_pairs: (price, Poisson demand rate) per price level
    """
    self.initial_inventory = initial_inventory
    self.time_steps = time_steps
    self.price_lambda_pairs = price_lambda_pairs

    prices = [p for p, _ in price_lambda_pairs]
    distrs = [poisson(l) for _, l in price_lambda_pairs]

    def sales_distribution(stock: int, i: int) -> Categorical:
        # Distribution of (remaining stock, revenue) when `stock` units are
        # on hand and price index `i` is chosen: k units sell with Poisson
        # pmf for k < stock, and the entire tail of demand >= stock is
        # lumped into selling out.
        outcomes = {}
        for k in range(stock + 1):
            prob = distrs[i].pmf(k) if k < stock \
                else 1 - distrs[i].cdf(stock - 1)
            outcomes[(stock - k, prices[i] * k)] = prob
        return Categorical(outcomes)

    self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
        FiniteMarkovDecisionProcess({
            s: {i: sales_distribution(s, i) for i in range(len(prices))}
            for s in range(initial_inventory + 1)
        })
    self.mdp = finite_horizon_MDP(self.single_step_mdp, time_steps)
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess,
    gamma: float,
    tolerance: float,
    max_iters: int
) -> Tuple[V[S], FinitePolicy]:
    """Implement policy iteration on a finite MDP.

    :param mdp: Object representation of a finite Markov decision process
    :param gamma: Discount factor
    :param tolerance: Difference in maximum value functions between
        iterations for convergence
    :param max_iters: Maximum number of iterations to allow
    :returns: Optimal value function
    :returns: Optimal policy
    """
    vf, pi = initialize(mdp)
    n_iter = 0
    while True:
        n_iter += 1
        v = vf

        # Policy evaluation: solve the MRP induced by the current policy.
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        vf = {mrp.non_terminal_states[i]: value
              for i, value in enumerate(mrp.get_value_function_vec(gamma))}

        # Convergence check keyed by state.  BUG FIX: the original
        # subtracted raw `.values()` lists, which misaligns (or raises on a
        # shape mismatch) because `initialize` keys the value function by
        # *all* states while evaluation keys it by non-terminal states only.
        # Missing keys are treated as 0, matching the initialization.
        delta = max(abs(vf[s] - v.get(s, 0)) for s in vf)

        # Policy improvement: act greedily w.r.t. the new value function.
        pi = dp.greedy_policy_from_vf(mdp, vf, gamma)

        if n_iter == max_iters:
            print("Maximum iterations reached.")
            return vf, pi
        if delta < tolerance:
            return vf, pi
class TestEvaluate(unittest.TestCase):
    """Tests for TD prediction and TD control on a two-state flip-flop
    process / MDP."""

    def setUp(self):
        # Fix the RNG so the sampled traces are reproducible.
        random.seed(42)
        self.finite_flip_flop = FlipFlop(0.7)
        # Explicit MDP version of the flip-flop: action True keeps the
        # current state with probability 0.7 (reward 1.0), action False
        # swaps those probabilities.
        self.finite_mdp = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({
                    (True, 1.0): 0.7,
                    (False, 2.0): 0.3
                }),
                False: Categorical({
                    (True, 1.0): 0.3,
                    (False, 2.0): 0.7
                }),
            },
            False: {
                True: Categorical({
                    (False, 1.0): 0.7,
                    (True, 2.0): 0.3
                }),
                False: Categorical({
                    (False, 1.0): 0.3,
                    (True, 2.0): 0.7
                }),
            },
        })

    def test_evaluate_finite_mrp(self) -> None:
        """TD prediction on the flip-flop MRP should converge near 170
        for both states."""
        # Tabular approximation with a constant 0.1 learning rate.
        start = Tabular(
            {s: 0.0 for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        # Truncate each sampled episode to 20 steps and flatten the
        # episodes into a single stream of transitions.
        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        # Take the approximation after 10000 TD updates.
        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)
            for s in v.values_map:
                # Intentionally loose bound—otherwise test is too slow.
                # Takes >1s on my machine otherwise.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False

    def test_evaluate_finite_mdp(self) -> None:
        """TD control on the flip-flop MDP should find Q-values near 170
        for action False and prefer False over True in both states."""
        # Q-table over all (state, action) pairs, constant 0.1 step size.
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )
        # Behavior policy: choose uniformly among each state's actions.
        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })
        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        # Take the Q approximation after 20000 TD-control updates.
        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)
            for s in [True, False]:
                # Loose numeric bound for speed; also check the action
                # ordering the optimal policy implies.
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False