Example #1
    def setUp(self):
        random.seed(42)

        self.finite_flip_flop = FlipFlop(0.7)

        self.finite_mdp = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({
                    (True, 1.0): 0.7,
                    (False, 2.0): 0.3
                }),
                False: Categorical({
                    (True, 1.0): 0.3,
                    (False, 2.0): 0.7
                }),
            },
            False: {
                True: Categorical({
                    (False, 1.0): 0.7,
                    (True, 2.0): 0.3
                }),
                False: Categorical({
                    (False, 1.0): 0.3,
                    (True, 2.0): 0.7
                }),
            },
        })
Example #2
def finite_horizon_MDP(
        process: FiniteMarkovDecisionProcess[S, A],
        limit: int) -> FiniteMarkovDecisionProcess[WithTime[S], A]:
    """Turn a normal FiniteMarkovDecisionProcess into one with a finite
    horizon that stops after 'limit' steps.

    Note that this makes the data representation of the process
    larger, since we end up having distinct sets and transitions for
    every single time step up to the limit.

    """
    mapping: Dict[WithTime[S], Optional[Dict[A,
                                             StateReward[WithTime[S]]]]] = {}

    # Non-terminal states
    for time in range(0, limit):
        for s in process.states():
            s_time = WithTime(state=s, time=time)
            actions_map = process.action_mapping(s)
            if actions_map is None:
                mapping[s_time] = None
            else:
                mapping[s_time] = {
                    a:
                    result.map(lambda s_r:
                               (WithTime(state=s_r[0], time=time + 1), s_r[1]))
                    for a, result in actions_map.items()
                }

    # Terminal states
    for s in process.states():
        mapping[WithTime(state=s, time=limit)] = None

    return FiniteMarkovDecisionProcess(mapping)
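
As a quick usage sketch (hedged: it reuses the FiniteMarkovDecisionProcess, Categorical, WithTime and finite_horizon_MDP names exactly as they appear in these snippets, and the same flip-flop transition structure as Examples #1 and #9; the import locations are not shown here and are an assumption about the library layout):

# Usage sketch: wrap the two-state flip-flop MDP with a 3-step horizon.
flip_flop = FiniteMarkovDecisionProcess({
    True: {
        True: Categorical({(True, 1.0): 0.7, (False, 2.0): 0.3}),
        False: Categorical({(True, 1.0): 0.3, (False, 2.0): 0.7}),
    },
    False: {
        True: Categorical({(False, 1.0): 0.7, (True, 2.0): 0.3}),
        False: Categorical({(False, 1.0): 0.3, (True, 2.0): 0.7}),
    },
})
finite = finite_horizon_MDP(flip_flop, limit=3)
# States are now time-indexed, e.g. WithTime(state=True, time=0);
# every state with time == 3 is terminal (its action mapping is None).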
Example #3
def finite_horizon_MDP(
        process: FiniteMarkovDecisionProcess[S, A],
        limit: int) -> FiniteMarkovDecisionProcess[WithTime[S], A]:
    '''Turn a normal FiniteMarkovDecisionProcess into one with a finite
    horizon that stops after 'limit' steps.

    Note that this makes the data representation of the process
    larger, since we end up having distinct sets and transitions for
    every single time step up to the limit.

    '''
    mapping: Dict[WithTime[S], Dict[A, FiniteDistribution[Tuple[WithTime[S],
                                                                float]]]] = {}

    # Non-terminal states
    for time in range(0, limit):
        for s in process.non_terminal_states:
            s_time = WithTime(state=s.state, time=time)
            actions_map = process.action_mapping(s)
            mapping[s_time] = {
                a:
                result.map(lambda sr:
                           (WithTime(state=sr[0].state, time=time + 1), sr[1]))
                for a, result in actions_map.items()
            }

    return FiniteMarkovDecisionProcess(mapping)
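
The two finite_horizon_MDP versions above appear to target different revisions of the same library. Example #2 marks terminal states explicitly by mapping every WithTime(state=s, time=limit) to None, while Example #3 iterates over process.non_terminal_states (whose elements wrap the underlying state, hence s.state and sr[0].state) and omits the time == limit states from the mapping, presumably relying on the constructor to treat any successor state missing from the mapping as terminal.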
Example #4
def sarsa_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        num_episodes: int = 10000,
        eps: float = 0.1,
        base_lr: float = 0.03,
        half_life: float = 1000.0,
        exponent: float = 0.5) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    state = Categorical({state: 1 for state in states}).sample()
    for i in range(num_episodes):
        action_distribution = Pi.act(state)
        action = action_distribution.sample()
        next_distribution = mdp_to_sample.step(state, action)
        next_state, reward = next_distribution.sample()
        next_action = Pi.act(next_state).sample()
        counts_per_state_act[(state, action)] += 1
        alpha = base_lr / (1 + (
            (counts_per_state_act[(state, action)] - 1) / half_life)**exponent)
        # SARSA update: move Q(s, a) toward r + γ * Q(s', a') for the sampled transition
        q[(state,
           action)] += alpha * (reward + γ * q[(next_state, next_action)] -
                                q[(state, action)])
        # Epsilon-greedy policy improvement at the visited state
        new_pol: Mapping[S, Optional[Categorical[A]]] = dict(Pi.policy_map)
        if actions[state] is None:
            new_pol[state] = None
        else:
            action_probs = {
                action: eps / len(actions[state])
                for action in actions[state]
            }
            best_action = actions[state][0]
            for action in actions[state]:
                if q[(state, best_action)] <= q[(state, action)]:
                    best_action = action
            action_probs[best_action] += 1 - eps
            new_pol[state] = Categorical(action_probs)
        Pi = FinitePolicy(new_pol)
        state = next_state
        if next_state is None:
            state = Categorical({state: 1 for state in states}).sample()
    return q
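
A hypothetical invocation of sarsa_control_scratch on the flip-flop MDP sketched after Example #2 (flip_flop, flip_flop_states and flip_flop_actions are illustrative names, not part of the original code; that MDP has no terminal states, so the terminal-handling branches are never exercised):

flip_flop_states = [True, False]
flip_flop_actions = {True: [True, False], False: [True, False]}
q = sarsa_control_scratch(
    mdp_to_sample=flip_flop,
    states=flip_flop_states,
    actions=flip_flop_actions,
    γ=0.9,
    num_episodes=5000,
)
# q maps each (state, action) pair to its estimated action value under
# the epsilon-greedy SARSA iteration above.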
Example #5
    def get_finite_mdp(self) -> FiniteMarkovDecisionProcess[Cell, Move]:
        '''
        Returns the FiniteMarkovDecisionProcess object for this windy grid problem.
        '''
        return FiniteMarkovDecisionProcess(
            {s: self.get_transition_probabilities(s) for s in
             self.get_all_nt_states()}
        )
Example #6
    def get_finite_mdp(self) -> FiniteMarkovDecisionProcess[Cell, Move]:
        '''
        Returns the FiniteMarkovDecisionProcess object for this windy grid problem,
        with the terminal cells mapped to None.
        '''
        d1: StateActionMapping[Cell, Move] = \
            {s: self.get_transition_probabilities(s) for s in
             self.get_all_nt_states()}
        d2: StateActionMapping[Cell, Move] = {s: None for s in self.terminals}
        return FiniteMarkovDecisionProcess({**d1, **d2})
Example #7
def initialize(
    mdp: FiniteMarkovDecisionProcess
) -> Tuple[V[S], FinitePolicy]:
    """Initialize value function and policy.

    Initialize the value function to zeros at each state, and initialize the
    policy to a random choice of the action space at each non-terminal state.

    :param mdp: Object representation of a finite Markov decision process
    :returns: Value function initialized to zero at each state
    :returns: Random initial policy
    """
    # Set value function at each state equal to zero
    v_0: V[S] = {s: 0 for s in mdp.states()}
    # Set the policy to be a random choice of the action space at each state
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return v_0, pi_0
Example #8
def mc_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    start_state_distrib = Categorical({state: 1 for state in states})
    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)
        for step in episode:
            state = step.state
            action = step.action
            return_ = step.return_
            counts_per_state_act[(state, action)] += 1
            q[(state, action)] += 1 / counts_per_state_act[
                (state, action)] * (return_ - q[(state, action)])
        eps = 1 / (i + 1)
        # Epsilon-greedy policy improvement with a decaying epsilon
        new_pol: Mapping[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
                continue
            action_probs = {
                action: eps / len(actions[state])
                for action in actions[state]
            }
            best_action = actions[state][0]
            for action in actions[state]:
                if q[(state, best_action)] <= q[(state, action)]:
                    best_action = action
            action_probs[best_action] += 1 - eps
            new_pol[state] = Categorical(action_probs)
        Pi = FinitePolicy(new_pol)

    return q
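
Two design choices in mc_control_scratch are worth noting: the exploration rate eps = 1 / (i + 1) decays toward zero across episodes while keeping every action's probability strictly positive within each episode, which is the usual GLIE-style schedule for tabular epsilon-greedy Monte Carlo control, and the step size 1 / counts_per_state_act[(state, action)] makes q[(state, action)] exactly the running average of the returns observed for that pair.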
Example #9
    def setUp(self):
        self.finite_flip_flop = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({(True, 1.0): 0.7, (False, 2.0): 0.3}),
                False: Categorical({(True, 1.0): 0.3, (False, 2.0): 0.7}),
            },
            False: {
                True: Categorical({(False, 1.0): 0.7, (True, 2.0): 0.3}),
                False: Categorical({(False, 1.0): 0.3, (True, 2.0): 0.7}),
            }
        })
Example #10
    def __init__(self, initial_inventory: int, time_steps: int,
                 price_lambda_pairs: Sequence[Tuple[float, float]]):
        self.initial_inventory = initial_inventory
        self.time_steps = time_steps
        self.price_lambda_pairs = price_lambda_pairs
        distrs = [poisson(l) for _, l in price_lambda_pairs]
        prices = [p for p, _ in price_lambda_pairs]
        self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
            FiniteMarkovDecisionProcess({
                s: {i: Categorical(
                    {(s - k, prices[i] * k):
                     (distrs[i].pmf(k) if k < s else 1 - distrs[i].cdf(s - 1))
                     for k in range(s + 1)})
                    for i in range(len(prices))}
                for s in range(initial_inventory + 1)
            })
        self.mdp = finite_horizon_MDP(self.single_step_mdp, time_steps)
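
In this single-step MDP the state s is the on-hand inventory and each action i sets the price prices[i]; a Poisson(lambda_i) demand of k units leads to next state s - k with reward prices[i] * k. The last branch, 1 - distrs[i].cdf(s - 1), lumps the entire tail probability P(demand >= s) onto the sell-out outcome k = s, so each Categorical's probabilities sum to one. finite_horizon_MDP then unrolls this one-period model over time_steps periods.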
Example #11
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess,
    gamma: float,
    tolerance: float,
    max_iters: int
) -> Tuple[V[S], FinitePolicy]:
    """Implement policy iteration on a finite MDP.

    :param mdp: Object representation of a finite Markov decision process
    :param gamma: Discount factor
    :param tolerance: Difference in maximum value functions between iterations
        for convergence
    :param max_iters: Maximum number of iterations to allow
    :returns: Optimal value function
    :returns: Optimal policy
    """
    vf, pi = initialize(mdp)
    n_iter = 0

    while True:

        n_iter += 1
        delta = 0
        v = vf.copy()
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)

        # Policy evaluation
        vf: V[S] = {mrp.non_terminal_states[i]: v for i, v in enumerate(
            mrp.get_value_function_vec(gamma)
        )}
        diffs = np.absolute(np.subtract(list(vf.values()), list(v.values())))
        diffs = np.append(diffs, delta)
        delta = np.max(diffs)

        # Policy improvement
        pi: FinitePolicy[S, A] = dp.greedy_policy_from_vf(
            mdp, vf, gamma
        )

        if n_iter == max_iters:
            print("Maximum iterations reached.")
            return vf, pi
        if delta < tolerance:
            return vf, pi
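
A hypothetical call, reusing the flip_flop MDP sketched after Example #2 (the hyperparameter values are arbitrary and only for illustration):

vf_opt, pi_opt = policy_iteration(flip_flop, gamma=0.9, tolerance=1e-6, max_iters=100)
# vf_opt maps each state to its estimated optimal value;
# pi_opt is the greedy policy with respect to vf_opt.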
Example #12
class TestEvaluate(unittest.TestCase):
    def setUp(self):
        random.seed(42)

        self.finite_flip_flop = FlipFlop(0.7)

        self.finite_mdp = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({
                    (True, 1.0): 0.7,
                    (False, 2.0): 0.3
                }),
                False: Categorical({
                    (True, 1.0): 0.3,
                    (False, 2.0): 0.7
                }),
            },
            False: {
                True: Categorical({
                    (False, 1.0): 0.7,
                    (True, 2.0): 0.3
                }),
                False: Categorical({
                    (False, 1.0): 0.3,
                    (True, 2.0): 0.7
                }),
            },
        })

    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound; a tighter bound makes this
                # test take more than a second on my machine.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False

    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [True, False]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False