Example #1
    def get_q_learning_vf_and_policy(
        self,
        states_actions_dict: Mapping[Cell, Set[Move]],
        sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
        episodes: int = 10000,
        step_size: float = 0.01,
        epsilon: float = 0.1
    ) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-block cell.
        sample_func takes a state and an action as inputs and returns a
        sampled (next_state, reward) pair.
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items()}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            state: Cell = uniform_states.sample()
            # One possible completion of the "write your code here" stub,
            # mirroring the SARSA example below: behave epsilon-greedily,
            # but bootstrap off the greedy (max) Q-value of the next state
            # (terminal next states contribute 0 to the target).
            while state in nt_states:
                action: Move = WindyGrid.epsilon_greedy_action(
                    state, q, epsilon)
                next_state, reward = sample_func(state, action)
                next_return: float = max(q[next_state].values()) \
                    if next_state in nt_states else 0.
                q[state][action] += step_size * \
                    (reward + next_return - q[state][action])
                state = next_state

        vf_dict: V[Cell] = {NonTerminal(s): max(d.values()) for s, d
                            in q.items()}
        policy: FiniteDeterministicPolicy[Cell, Move] = \
            FiniteDeterministicPolicy(
                {s: max(d.items(), key=itemgetter(1))[0] for s, d in q.items()}
            )
        return (vf_dict, policy)
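
The key difference from the SARSA variant shown in Example #6 below is the bootstrap target. A minimal, self-contained sketch of the two tabular update rules in plain Python (hypothetical state/action keys, discount factor 1 as in the code above):

from typing import Dict

def q_learning_update(q: Dict[str, Dict[str, float]], s: str, a: str,
                      r: float, s_next: str, alpha: float) -> None:
    # Off-policy target: reward plus the best Q-value at the next state,
    # regardless of which action the behaviour policy will actually take.
    q[s][a] += alpha * (r + max(q[s_next].values()) - q[s][a])

def sarsa_update(q: Dict[str, Dict[str, float]], s: str, a: str, r: float,
                 s_next: str, a_next: str, alpha: float) -> None:
    # On-policy target: reward plus the Q-value of the action actually
    # chosen (epsilon-greedily) at the next state.
    q[s][a] += alpha * (r + q[s_next][a_next] - q[s][a])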
Example #2
    def setUp(self):
        ii = 10
        self.steps = 6
        pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
        self.cp: ClearancePricingMDP = ClearancePricingMDP(
            initial_inventory=ii,
            time_steps=self.steps,
            price_lambda_pairs=pairs)

        def policy_func(x: int) -> int:
            return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

        stationary_policy: FiniteDeterministicPolicy[int, int] = \
            FiniteDeterministicPolicy(
                {s: policy_func(s) for s in range(ii + 1)}
            )

        self.single_step_mrp: FiniteMarkovRewardProcess[int] = \
            self.cp.single_step_mdp.apply_finite_policy(stationary_policy)

        self.mrp_seq = unwrap_finite_horizon_MRP(
            finite_horizon_MRP(self.single_step_mrp, self.steps))

        self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
            self.cp.single_step_mdp

        self.mdp_seq = unwrap_finite_horizon_MDP(
            finite_horizon_MDP(self.single_step_mdp, self.steps))
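
A test built on this fixture might run finite-horizon policy evaluation over self.mrp_seq, as the standalone script in Example #8 does with evaluate. A hedged sketch, assuming evaluate is imported from the same finite-horizon module as unwrap_finite_horizon_MRP; the method name and assertion are illustrative only:

    def test_evaluate_stationary_policy(self):
        # Hypothetical test: evaluate (with gamma = 1) yields one value
        # function per time step of the unwrapped finite-horizon MRP.
        vf_seq = list(evaluate(self.mrp_seq, 1.))
        self.assertEqual(len(vf_seq), self.steps)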
Example #3
    def setUp(self):
        user_capacity = 2
        user_poisson_lambda = 1.0
        user_holding_cost = 1.0
        user_stockout_cost = 10.0

        self.gamma = 0.9

        self.si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
            SimpleInventoryMDPCap(
                capacity=user_capacity,
                poisson_lambda=user_poisson_lambda,
                holding_cost=user_holding_cost,
                stockout_cost=user_stockout_cost
            )

        self.fdp: FiniteDeterministicPolicy[InventoryState, int] = \
            FiniteDeterministicPolicy(
                {InventoryState(alpha, beta): user_capacity - (alpha + beta)
                 for alpha in range(user_capacity + 1)
                 for beta in range(user_capacity + 1 - alpha)}
        )

        self.implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
            self.si_mdp.apply_finite_policy(self.fdp)

        self.states: Sequence[NonTerminal[InventoryState]] = \
            self.implied_mrp.non_terminal_states
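
A test on this fixture can check properties that follow directly from the construction above: with capacity 2, the policy dictionary enumerates every (alpha, beta) pair with alpha + beta <= 2, and there are (2 + 1)(2 + 2) / 2 = 6 of them. A hedged sketch; the method name is illustrative only:

    def test_state_space_size(self):
        # Hypothetical test: the non-terminal states of the implied MRP
        # should be exactly the six inventory states enumerated when
        # building the deterministic policy above.
        self.assertEqual(len(self.states), 6)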
Example #4
def get_vf_and_policy_from_qvf(
    mdp: FiniteMarkovDecisionProcess[S, A], qvf: QValueFunctionApprox[S, A]
) -> Tuple[V[S], FiniteDeterministicPolicy[S, A]]:
    opt_vf: V[S] = {
        s: max(qvf((s, a)) for a in mdp.actions(s))
        for s in mdp.non_terminal_states
    }
    opt_policy: FiniteDeterministicPolicy[S, A] = \
        FiniteDeterministicPolicy({
            s.state: qvf.argmax((s, a) for a in mdp.actions(s))[1]
            for s in mdp.non_terminal_states
        })
    return opt_vf, opt_policy
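
A hedged usage sketch; some_mdp and fitted_qvf are placeholder names for a finite MDP and a Q-value function approximation fitted elsewhere (e.g. by a Q-learning run), not objects defined in this listing:

# Hypothetical usage: extract the greedy value function and deterministic
# policy implied by an already-fitted Q-value function approximation.
opt_vf, opt_policy = get_vf_and_policy_from_qvf(some_mdp, fitted_qvf)
print(opt_policy)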
Example #5
def greedy_policy_from_vf(mdp: FiniteMarkovDecisionProcess[S, A], vf: V[S],
                          gamma: float) -> FiniteDeterministicPolicy[S, A]:
    greedy_policy_dict: Dict[S, A] = {}

    for s in mdp.non_terminal_states:
        q_values: Iterator[Tuple[A, float]] = \
            ((a, mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * extended_vf(vf, s_r[0])
            )) for a in mdp.actions(s))
        greedy_policy_dict[s.state] = \
            max(q_values, key=operator.itemgetter(1))[0]

    return FiniteDeterministicPolicy(greedy_policy_dict)
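
This is the policy-improvement step of policy iteration: for each state it picks the action maximizing the one-step look-ahead value r + gamma * V(s'). A hedged usage sketch, reusing the inventory MDP built in Example #9 below and assuming a value function vf for it has already been computed (e.g. by policy evaluation):

# Hypothetical usage: greedily improve on an existing value function `vf`
# of the inventory MDP built in Example #9.
improved_policy = greedy_policy_from_vf(si_mdp, vf, gamma=0.9)
print(improved_policy)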
Example #6
    def get_sarsa_vf_and_policy(
        self,
        states_actions_dict: Mapping[Cell, Set[Move]],
        sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
        episodes: int = 10000,
        step_size: float = 0.01
    ) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-terminal cell.
        sample_func takes a state and an action as inputs and returns a
        sampled (next_state, reward) pair.
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items()}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            epsilon: float = 1.0 / (episode_num + 1)
            state: Cell = uniform_states.sample()
            action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
            while state in nt_states:
                next_state, reward = sample_func(state, action)
                if next_state in nt_states:
                    next_action: Move = WindyGrid.epsilon_greedy_action(
                        next_state, q, epsilon)
                    q[state][action] += step_size * \
                        (reward + q[next_state][next_action] -
                         q[state][action])
                    action = next_action
                else:
                    q[state][action] += step_size * (reward - q[state][action])
                state = next_state

        vf_dict: V[Cell] = {
            NonTerminal(s): max(d.values())
            for s, d in q.items()
        }
        policy: FiniteDeterministicPolicy[Cell, Move] = \
            FiniteDeterministicPolicy(
                {s: max(d.items(), key=itemgetter(1))[0] for s, d in q.items()}
            )
        return vf_dict, policy
Example #7
def optimal_vf_and_policy(
        steps: Sequence[StateActionMapping[S, A]], gamma: float
) -> Iterator[Tuple[V[S], FiniteDeterministicPolicy[S, A]]]:
    '''Use backwards induction to find the optimal value function and
    optimal policy at each time step.
    '''
    v_p: List[Tuple[V[S], FiniteDeterministicPolicy[S, A]]] = []

    for step in reversed(steps):
        this_v: Dict[NonTerminal[S], float] = {}
        this_a: Dict[S, A] = {}
        for s, actions_map in step.items():
            action_values = ((res.expectation(lambda s_r: s_r[1] + gamma * (
                extended_vf(v_p[-1][0], s_r[0]) if len(v_p) > 0 else 0.)), a)
                             for a, res in actions_map.items())
            v_star, a_star = max(action_values, key=itemgetter(0))
            this_v[s] = v_star
            this_a[s.state] = a_star
        v_p.append((this_v, FiniteDeterministicPolicy(this_a)))

    return reversed(v_p)
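
A hedged usage sketch, reusing the unwrapped finite-horizon MDP sequence built (as self.mdp_seq) in Example #2; backwards induction yields one (value function, policy) pair per time step, earliest step first:

# Hypothetical usage: mdp_seq is assumed to be a sequence of per-step
# state-action mappings, e.g. the mdp_seq constructed in Example #2.
for t, (vf, policy) in enumerate(optimal_vf_and_policy(mdp_seq, 1.)):
    print(f"Time Step {t:d}")
    print(policy)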
Example #8
    from pprint import pprint
    ii = 12
    steps = 8
    pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
    cp: ClearancePricingMDP = ClearancePricingMDP(initial_inventory=ii,
                                                  time_steps=steps,
                                                  price_lambda_pairs=pairs)
    print("Clearance Pricing MDP")
    print("---------------------")
    print(cp.mdp)

    def policy_func(x: int) -> int:
        return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

    stationary_policy: FiniteDeterministicPolicy[int, int] = \
        FiniteDeterministicPolicy({s: policy_func(s) for s in range(ii + 1)})

    single_step_mrp: FiniteMarkovRewardProcess[int] = \
        cp.single_step_mdp.apply_finite_policy(stationary_policy)

    vf_for_policy: Iterator[V[int]] = evaluate(
        unwrap_finite_horizon_MRP(finite_horizon_MRP(single_step_mrp, steps)),
        1.)

    print("Value Function for Stationary Policy")
    print("------------------------------------")
    for t, vf in enumerate(vf_for_policy):
        print(f"Time Step {t:d}")
        print("---------------")
        pprint(vf)
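
For comparison with the stationary-policy values above, the same finite-horizon problem can be solved by backwards induction with optimal_vf_and_policy from Example #7; a hedged sketch in the style of the script above, assuming that function and finite_horizon_MDP / unwrap_finite_horizon_MDP are imported from the same finite-horizon module:

    # Sketch (not part of the original script): optimal prices per time step.
    print("Optimal Value Function and Optimal Policy")
    print("-----------------------------------------")
    for t, (vf, policy) in enumerate(optimal_vf_and_policy(
            unwrap_finite_horizon_MDP(
                finite_horizon_MDP(cp.single_step_mdp, steps)),
            1.)):
        print(f"Time Step {t:d}")
        print("---------------")
        pprint(vf)
        print(policy)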
Example #9
    si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    print("MDP Transition Map")
    print("------------------")
    print(si_mdp)

    fdp: FiniteDeterministicPolicy[InventoryState, int] = \
        FiniteDeterministicPolicy(
            {InventoryState(alpha, beta): user_capacity - (alpha + beta)
             for alpha in range(user_capacity + 1)
             for beta in range(user_capacity + 1 - alpha)}
    )

    print("Deterministic Policy Map")
    print("------------------------")
    print(fdp)

    implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
        si_mdp.apply_finite_policy(fdp)
    print("Implied MP Transition Map")
    print("--------------")
    print(
        FiniteMarkovProcess({
            s.state: Categorical({s1.state: p
                                  for s1, p in v.table().items()})