Example No. 1
    def setUp(self):
        user_capacity = 2
        user_poisson_lambda = 1.0
        user_holding_cost = 1.0
        user_stockout_cost = 10.0

        self.gamma = 0.9

        self.si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
            SimpleInventoryMDPCap(
                capacity=user_capacity,
                poisson_lambda=user_poisson_lambda,
                holding_cost=user_holding_cost,
                stockout_cost=user_stockout_cost
            )

        self.fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
            InventoryState(alpha, beta):
            Constant(user_capacity - (alpha + beta))
            for alpha in range(user_capacity + 1)
            for beta in range(user_capacity + 1 - alpha)
        })

        self.implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
            self.si_mdp.apply_finite_policy(self.fdp)

        self.states: Sequence[InventoryState] = \
            self.implied_mrp.non_terminal_states
Example No. 2
    def test_flip_flop(self):
        trace = list(
            itertools.islice(
                self.flip_flop.simulate(Constant(NonTerminal(True))), 10))

        self.assertTrue(
            all(isinstance(outcome.state, bool) for outcome in trace))

        longer_trace = itertools.islice(
            self.flip_flop.simulate(Constant(NonTerminal(True))), 10000)
        count_trues = len(
            list(outcome for outcome in longer_trace if outcome.state))

        # If the code is correct, this assertion should fail only with
        # vanishingly small probability
        self.assertTrue(1000 < count_trues < 9000)
Example No. 3
 def get_opt_vf_and_policy(self) -> \
         Iterator[Tuple[V[int], FinitePolicy[int, bool]]]:
     dt: float = self.dt()
     up_factor: float = np.exp(self.vol * np.sqrt(dt))
     up_prob: float = (np.exp(self.rate * dt) * up_factor - 1) / \
         (up_factor * up_factor - 1)
     return optimal_vf_and_policy(
         steps=[
             {j: None if j == -1 else {
                 True: Constant(
                     (
                         -1,
                         self.payoff(i * dt, self.state_price(i, j))
                     )
                 ),
                 False: Categorical(
                     {
                         (j + 1, 0.): up_prob,
                         (j, 0.): 1 - up_prob
                     }
                 )
             } for j in range(i + 1)}
             for i in range(self.num_steps + 1)
         ],
         gamma=np.exp(-self.rate * dt)
     )
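A note on up_prob: assuming a CRR-style binomial lattice with up factor u = e^{sigma sqrt(dt)} and down factor d = 1/u (the down factor is an assumption implied by the form of the expression), up_prob is the risk-neutral up-move probability; multiplying numerator and denominator by u gives the expression used in the code:

$$ q = \frac{e^{r\Delta t} - d}{u - d} = \frac{e^{r\Delta t}\,u - 1}{u^{2} - 1}, \qquad u = e^{\sigma\sqrt{\Delta t}}, \quad d = 1/u. $$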
Example No. 4
    def setUp(self):
        ii = 12
        self.steps = 8
        pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
        self.cp: ClearancePricingMDP = ClearancePricingMDP(
            initial_inventory=ii,
            time_steps=self.steps,
            price_lambda_pairs=pairs)

        def policy_func(x: int) -> int:
            return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

        stationary_policy: FinitePolicy[int, int] = FinitePolicy(
            {s: Constant(policy_func(s))
             for s in range(ii + 1)})

        self.single_step_mrp: FiniteMarkovRewardProcess[
            int] = self.cp.single_step_mdp.apply_finite_policy(
                stationary_policy)

        self.mrp_seq = unwrap_finite_horizon_MRP(
            finite_horizon_MRP(self.single_step_mrp, self.steps))

        self.single_step_mdp: FiniteMarkovDecisionProcess[
            int, int] = self.cp.single_step_mdp

        self.mdp_seq = unwrap_finite_horizon_MDP(
            finite_horizon_MDP(self.single_step_mdp, self.steps))
Example No. 5
    def get_q_learning_vf_and_policy(
            self,
            states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
            sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
            episodes: int = 10000,
            step_size: float = 0.01,
            epsilon: float = 0.1) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-block cell.
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            state: Cell = uniform_states.sample()
            '''
            write your code here
            update the dictionary q initialized above according
            to the Q-learning algorithm's Q-Value Function updates.
            '''

        vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[Cell, Move] = FinitePolicy({
            s: Constant(max(d.items(), key=itemgetter(1))[0])
            for s, d in q.items()
        })
        return (vf_dict, policy)
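One possible fill-in for the placeholder above, offered as a sketch rather than the intended solution: a tabular Q-learning update with an epsilon-greedy behaviour policy, assuming an undiscounted problem (gamma = 1) and `import random` at module level. It reuses the method's local names (state, q, nt_states, epsilon, step_size, sample_func) and would sit inside the `for episode_num in range(episodes):` loop, right after `state` is sampled:

            while state in nt_states:
                # epsilon-greedy behaviour policy over the moves in `state`
                if random.random() < epsilon:
                    action = random.choice(list(q[state]))
                else:
                    action = max(q[state], key=q[state].get)
                next_state, reward = sample_func(state, action)
                # bootstrap from the best action value of the next state
                # (0 if the next state is terminal)
                next_q = max(q[next_state].values()) \
                    if next_state in nt_states else 0.
                q[state][action] += \
                    step_size * (reward + next_q - q[state][action])
                state = next_state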
Example No. 6
def reward_MRP_simulation(
        alpha: float, gamma: float, num_time: int, stock_MP3: StockPriceMP3,
        init_price: float, f: Callable[[float],
                                       float]) -> List[Tuple[StateMP3, float]]:
    """
    Simulate reward from MRP stock price model for fixed time interval.

    :param alpha: Reverse-pull strength in stock model
    :param gamma: discount factor
    :param num_time: Number of time steps during which to record simulation
    :param stock_MP3: Markov process representation of the stock price model
    :param init_price: Initial stock price
    :param f: Function computing reward at time t from state at time t
    :returns: List of (state, reward) tuples obtained by MRP simulation
    """
    stock_MRP3: StockPriceMRP3 = StockPriceMRP3(alpha, stock_MP3, init_price,
                                                f)
    start: Constant = Constant(StateMP3(0, 0))
    return [
        (step.next_state, step.reward * gamma ** (t+1)) \
            for t, step in enumerate(
                itertools.islice(
                    stock_MRP3.simulate_reward(start),
                    num_time+1
                )
            )
    ]
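Under the convention used above (the reward observed at step t is weighted by gamma ** (t + 1)), the second components of the returned list sum to a single sample of the discounted reward accumulated over the simulated interval:

$$ \sum_{t=0}^{\text{num\_time}} \gamma^{\,t+1} R_{t+1}. $$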
Example No. 7
def main(num_pads):
    # 2^(num_pads-2) deterministic policies
    fc_mdp: FiniteMarkovDecisionProcess[FrogState,
                                        Any] = FrogCroak(num_pads + 1)
    all_fp = list(itertools.product(['A', 'B'], repeat=fc_mdp.num_pads - 2))
    all_mrp_value = []
    for fp in all_fp:
        fdp: FinitePolicy[FrogState, Any] = FinitePolicy(
            {FrogState(i + 1): Constant(fp[i])
             for i in range(len(fp))})
        implied_mrp: FiniteMarkovRewardProcess[
            FrogState] = fc_mdp.apply_finite_policy(fdp)
        all_mrp_value.append(implied_mrp.get_value_function_vec(1))

    # find the optimal policy
    max_indices = []
    value_matrix = np.array(all_mrp_value)
    for i in range(num_pads - 1):
        max_indices.append(np.argmax(value_matrix[:, i]))
    max_index = list(set(max_indices))[0]
    print(value_matrix[max_index, :])
    print(all_fp[max_index])
    plt.plot([
        'State' + str(i + 1) + ',' + all_fp[max_index][i]
        for i in range(num_pads - 1)
    ], value_matrix[max_index, :], 'o')
    plt.xlabel('Frog State')
    plt.ylabel('Probability')
    plt.title('n = ' + str(num_pads - 1))
    plt.show()
Example No. 8
 def act(self, state: State) -> Optional[Distribution[Action]]:
     # Optimal bid/ask offsets from the mid-price, quoted as a
     # deterministic (Constant) action.
     delta_b: float = (2 * state.I + 1) * self.gamma * self.sigma ** 2 * \
         (self.T - state.t) / 2 + \
         1 / self.gamma * np.log(1 + self.gamma / self.k)
     delta_a: float = (-2 * state.I + 1) * self.gamma * self.sigma ** 2 * \
         (self.T - state.t) / 2 + \
         1 / self.gamma * np.log(1 + self.gamma / self.k)
     Pb: float = state.S - delta_b
     Pa: float = state.S + delta_a
     return Constant(Action(Pb=Pb, Pa=Pa))
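These offsets have the form of the Avellaneda-Stoikov optimal market-making quotes, assuming state.I is the current inventory, state.S the mid-price, self.gamma the risk-aversion, self.sigma the volatility and self.k the order-arrival decay parameter:

$$ \delta^{b} = \frac{(2I+1)\,\gamma\sigma^{2}(T-t)}{2} + \frac{1}{\gamma}\ln\Big(1 + \frac{\gamma}{k}\Big), \qquad \delta^{a} = \frac{(1-2I)\,\gamma\sigma^{2}(T-t)}{2} + \frac{1}{\gamma}\ln\Big(1 + \frac{\gamma}{k}\Big), $$
$$ P^{b} = S - \delta^{b}, \qquad P^{a} = S + \delta^{a}. $$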
Example No. 9
 def get_mapping(self) -> StateActionMapping[State, Action]:
     #We need to define the StateActionMapping for this Finite MDP
     mapping: StateActionMapping[State, Action] = {}
     list_actions: List[Action] = []
     #We start by defining all the available actions
     for i in range(self.H + 1):
         range_j = self.H - i
         for j in range(range_j + 1):
             list_actions.append(Action(i, j))
     self.list_actions: List[Action] = list_actions
     list_states: List[State] = []
     #Then we define all the possible states
     for i in range(1, self.W + 1):
         list_states.append(State(i))
     self.list_states: List[State] = list_states
     for state in list_states:
         submapping: ActionMapping[Action, StateReward[State]] = {}
         for action in list_actions:
             s: int = action.s
             l: int = action.l
             reward: float = state.wage * (self.H - l - s)
             pois_mean: float = self.alpha * l
             proba_offer: float = self.beta * s / self.H
             if state.wage == self.W:
                 # If you're in state W, you stay in state W with
                 # probability 1. The reward depends only on the action
                 # you have chosen
                 submapping[action] = Constant((state, reward))
             elif state.wage == self.W - 1:
                 #If you're in state W-1, you can either stay in your state
                 #or land in state W
                 submapping[action] = Categorical({
                     (state, reward):
                         poisson.pmf(0, pois_mean) * (1 - proba_offer),
                     (State(self.W), reward):
                         proba_offer + (1 - proba_offer) *
                         (1 - poisson.pmf(0, pois_mean))
                 })
             else:
                 # If you're in any other state, you can land in any state
                 # between your current state and W, with probabilities
                 # as described before
                 dic_distrib = {}
                 dic_distrib[(state, reward)] = poisson.pmf(
                     0, pois_mean) * (1 - proba_offer)
                 dic_distrib[(State(state.wage + 1), reward)] = \
                     proba_offer * poisson.cdf(1, pois_mean) + \
                     (1 - proba_offer) * poisson.pmf(1, pois_mean)
                 for k in range(2, self.W - state.wage):
                     dic_distrib[(State(state.wage + k),
                                  reward)] = poisson.pmf(k, pois_mean)
                 dic_distrib[(State(self.W), reward)] = 1 - poisson.cdf(
                     self.W - state.wage - 1, pois_mean)
                 submapping[action] = Categorical(dic_distrib)
         mapping[state] = submapping
     return mapping
Example No. 10
def process_traces(time_steps: int, num_traces: int,
                   game: SnakesAndLaddersGame) -> np.ndarray:
    start_state_distribution = Constant(StateSnakeAndLadder(position=1))
    array_length = []
    for i in range(num_traces):
        new_val = np.fromiter((s.position for s in itertools.islice(
            game.simulate(start_state_distribution), time_steps + 1)), float)
        array_length += [len(new_val)]
    return np.array(array_length)
Example No. 11
    def test_flip_flop(self):
        trace = list(
            itertools.islice(self.flip_flop.simulate_reward(Constant(True)),
                             10))

        self.assertTrue(all(isinstance(outcome, bool) for outcome, _ in trace))

        cumulative_reward = sum(reward for _, reward in trace)
        self.assertTrue(0 <= cumulative_reward <= 10)
Example No. 12
def process1_price_traces(start_price: int, level_param: int, alpha1: float,
                          time_steps: int, num_traces: int) -> np.ndarray:
    mp = StockPriceMP1(level_param=level_param, alpha1=alpha1)
    start_state_distribution = Constant(StateMP1(price=start_price))
    return np.vstack([
        np.fromiter((s.price for s in itertools.islice(
            mp.simulate(start_state_distribution), time_steps + 1)), float)
        for _ in range(num_traces)
    ])
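A hypothetical call (the parameter values are illustrative only); the result stacks one simulated price path per row, so its shape is (num_traces, time_steps + 1):

traces = process1_price_traces(start_price=100, level_param=100,
                               alpha1=0.25, time_steps=100, num_traces=1000)
print(traces.shape)  # (1000, 101): 1000 traces, initial price plus 100 steps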
Example No. 13
    def test_choose(self):
        assert_almost_equal(self, self.one, Constant(1))
        self.assertAlmostEqual(self.one.probability(1), 1.)
        self.assertAlmostEqual(self.one.probability(0), 0.)

        categorical_six = Categorical({x: 1 / 6 for x in range(1, 7)})
        assert_almost_equal(self, self.six, categorical_six)
        self.assertAlmostEqual(self.six.probability(1), 1 / 6)
        self.assertAlmostEqual(self.six.probability(0), 0.)
Example No. 14
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None

            if explore.sample():
                return Choose(set(mdp.actions(s)))

            _, action = q.argmax((s, a) for a in mdp.actions(s))
            return Constant(action)
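Assuming explore samples True with probability epsilon (e.g. a Bernoulli(epsilon) distribution) and q.argmax returns the (state, action) pair with the highest estimated Q-value, this act method induces the usual epsilon-greedy action probabilities:

$$ \pi(a \mid s) = \begin{cases} 1 - \varepsilon + \dfrac{\varepsilon}{|\mathcal{A}(s)|}, & a = \arg\max_{a'} Q(s, a'), \\[4pt] \dfrac{\varepsilon}{|\mathcal{A}(s)|}, & \text{otherwise.} \end{cases} $$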
Example No. 15
 def get_all_deterministic_policies(self) -> Sequence[FinitePolicy[LilypadState, str]]:
     bin_to_act = {'0':'A', '1':'B'}
     all_action_comb = self.get_all_action_combinations()
     all_policies = []
     for action_comb in all_action_comb:
         policy: FinitePolicy[LilypadState, str] = FinitePolicy(
             {LilypadState(i + 1): Constant(bin_to_act[a])
              for i, a in enumerate(action_comb)}
         )
         all_policies.append(policy)
     return all_policies
Example No. 16
 def act(self, state: S) -> Constant[A]:
     return Constant(
         max(
             (
                 (mdp.step(state, a).expectation(return_), a)
                 for a in mdp.actions(state)
             ),
             key=itemgetter(0),
         )[1]
     )
Example No. 17
    def test_flip_flop(self):
        trace = list(
            itertools.islice(self.flip_flop.simulate_reward(Constant(True)),
                             10))

        self.assertTrue(
            all(isinstance(step.next_state, bool) for step in trace))

        cumulative_reward = sum(step.reward for step in trace)
        self.assertTrue(0 <= cumulative_reward <= 10)
Example No. 18
def process2_price_traces(start_price: int, alpha2: float, time_steps: int,
                          num_traces: int) -> np.ndarray:
    mp = StockPriceMP2(alpha2=alpha2)
    start_state_distribution = Constant(
        StateMP2(price=start_price, is_prev_move_up=None))
    return np.vstack([
        np.fromiter((s.price for s in itertools.islice(
            mp.simulate(start_state_distribution), time_steps + 1)), float)
        for _ in range(num_traces)
    ])
Example No. 19
 def act(self, state: S) -> Constant[A]:
     return Constant(
         max(
             (
                 (res.expectation(return_), a)
                 for a, res in step[state].items()
             ),
             key=itemgetter(0),
         )[1]
     )
Example No. 20
    def test_optimal_policy(self):
        finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
        steps = unwrap_finite_horizon_MDP(finite)
        *v_ps, (v, p) = optimal_vf_and_policy(steps, gamma=1)

        for s in p.states():
            self.assertEqual(p.act(s), Constant(False))

        self.assertAlmostEqual(v_ps[0][0][True], 17)
        self.assertAlmostEqual(v_ps[5][0][False], 17 / 2)
Example No. 21
def get_opt_vf_from_q(
        q_value: Mapping[Tuple[S, A], float]
) -> Tuple[Mapping[S, float], FinitePolicy[S, A]]:
    # Keep, for each state, the best action value seen so far and the
    # corresponding greedy action.
    v: Dict[S, float] = {}
    policy_map: Dict[S, Optional[Constant[A]]] = {}
    for (state, action), value in q_value.items():
        if state not in v or value > v[state]:
            v[state] = value
            policy_map[state] = Constant(action)
    return (v, FinitePolicy(policy_map))
Example No. 22
def get_policies(n: int) -> Iterable[FinitePolicy[StatePond, Action]]:
    # Enumerate all 2^(n-1) deterministic policies: one action ('A' or 'B')
    # per non-terminal state 1, ..., n-1.
    list_policies: List[FinitePolicy[StatePond, Action]] = []
    liste_actions: list = list(itertools.product(['A', 'B'], repeat=n - 1))
    for i in liste_actions:
        policy_map: Dict[StatePond, Optional[FiniteDistribution[Action]]] = {}
        policy_map[StatePond(0)] = None
        policy_map[StatePond(n)] = None
        for j in range(n - 1):
            policy_map[StatePond(j + 1)] = Constant(Action(i[j]))
        list_policies.append(FinitePolicy(policy_map))
    return list_policies
Example No. 23
def process3_price_traces(start_price: int, alpha3: float, time_steps: int,
                          num_traces: int) -> np.ndarray:
    mp = StockPriceMP3(alpha3=alpha3)
    start_state_distribution = Constant(
        StateMP3(num_up_moves=0, num_down_moves=0))
    return np.vstack([
        np.fromiter(
            (start_price + s.num_up_moves - s.num_down_moves
             for s in itertools.islice(mp.simulate(start_state_distribution),
                                       time_steps + 1)), float)
        for _ in range(num_traces)
    ])
Example No. 24
def get_vf_and_policy_from_qvf(
        mdp: FiniteMarkovDecisionProcess[S, A],
        qvf: FunctionApprox[Tuple[S, A]]) -> Tuple[V[S], FinitePolicy[S, A]]:
    opt_vf: V[S] = {
        s: max(qvf((s, a)) for a in mdp.actions(s))
        for s in mdp.non_terminal_states
    }
    opt_policy: FinitePolicy[S, A] = FinitePolicy({
        s: Constant(qvf.argmax((s, a) for a in mdp.actions(s))[1])
        for s in mdp.non_terminal_states
    })
    return opt_vf, opt_policy
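The two comprehensions are the standard greedy read-out of a state-value function and a deterministic policy from an (approximate) action-value function:

$$ V(s) = \max_{a \in \mathcal{A}(s)} Q(s, a), \qquad \pi(s) = \arg\max_{a \in \mathcal{A}(s)} Q(s, a). $$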
Example No. 25
            def step(self, state: float,
                     action: bool) -> SampledDistribution[Tuple[float, float]]:
                if action:
                    return Constant((state, payoffs(state)))
                else:

                    def sr_sampler_func(state=state,
                                        action=action) -> Tuple[float, float]:
                        next_state_price: float = asset_distribution.sample()
                        reward: float = 0
                        return (next_state_price, reward)

                    return SampledDistribution(sampler=sr_sampler_func,
                                               expectation_samples=1000)
Example No. 26
def frog_problem_traces(num_lilypads: int, n_traces: int) -> np.ndarray:
    """Simulate frog problem to predict expected hops required to cross river.

    In each simulation, the frog starts on a riverbank, so the starting state
    will always be equal to the total number of lilypads in the simulation.

    :param num_lilypads: Number of lilypads between riverbanks
    :param n_traces: Number of traces to generate
    :return: Hopping counts required to cross the river obtained from traces
    """
    frog_problem_sim = FrogProblemMPFinite(n_lilypads=num_lilypads)
    start_state = Constant(FrogState(num_lilypads))
    return np.fromiter((len(list(trace)) for trace in itertools.islice(
        frog_problem_sim.traces(start_state), n_traces)), int)
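A hypothetical call, assuming FrogProblemMPFinite and FrogState are defined as in the snippet; averaging the returned counts gives a Monte Carlo estimate of the expected number of hops:

hops = frog_problem_traces(num_lilypads=9, n_traces=10000)
print(hops.mean())  # estimated expected hop count for 9 lilypads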
Example No. 27
def greedy_policy_from_vf(mdp: FiniteMarkovDecisionProcess[S, A], vf: V[S],
                          gamma: float) -> FinitePolicy[S, A]:
    greedy_policy_dict: Dict[S, FiniteDistribution[A]] = {}

    for s in mdp.non_terminal_states:

        q_values: Iterator[Tuple[A, float]] = \
            ((a, mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * vf.get(s_r[0], 0.)
            )) for a in mdp.actions(s))

        greedy_policy_dict[s] =\
            Constant(max(q_values, key=operator.itemgetter(1))[0])

    return FinitePolicy(greedy_policy_dict)
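The expectation inside the loop is the one-step look-ahead Q-value used for greedy policy improvement, with V(s') taken as 0 for states missing from vf (terminal states), matching vf.get(s_r[0], 0.):

$$ Q(s, a) = \mathbb{E}\big[R + \gamma V(S') \mid s, a\big], \qquad \pi'(s) = \arg\max_{a \in \mathcal{A}(s)} Q(s, a). $$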
Example No. 28
    def get_action_transition_reward_map(self, maze: Maze):
        d: Dict[GridState, Dict[str, Categorical[Tuple[GridState,
                                                       float]]]] = {}

        for x in range(maze.nx):
            for y in range(maze.ny):
                state = GridState(x, y)
                if state != self.goal:
                    d1: Dict[str, Categorical[Tuple[GridState, float]]] = {}
                    cell = maze.cell_at(x, y)
                    for move, next_cell in maze.find_valid_neighbours(cell):
                        if not cell.has_wall_at(move):
                            next_state = GridState(next_cell.x, next_cell.y)
                            d1[move] = Constant(
                                (next_state, self.reward_func(next_state)))
                    d[state] = d1
        return d
Example No. 29
    def fraction_of_days_oos(self, policy: Policy[InventoryState, int],
                             time_steps: int, num_traces: int) -> float:
        impl_mrp: MarkovRewardProcess[InventoryState] =\
            self.apply_policy(policy)
        count: int = 0
        high_fractile: int = int(poisson(self.poisson_lambda).ppf(0.98))
        start: InventoryState = random.choice(
            [InventoryState(i, 0) for i in range(high_fractile + 1)])

        for _ in range(num_traces):
            steps = itertools.islice(impl_mrp.simulate_reward(Constant(start)),
                                     time_steps)
            for step in steps:
                if step.reward < -self.holding_cost * step.state.on_hand:
                    count += 1

        return float(count) / (time_steps * num_traces)
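The condition step.reward < -self.holding_cost * step.state.on_hand flags a simulated day as out-of-stock whenever its reward contains a stockout penalty on top of the pure holding cost, i.e. whenever

$$ R_t < -\,h\,\alpha_t, $$

where h is the holding cost and alpha_t the on-hand inventory; the function returns the fraction of the time_steps * num_traces simulated days on which this happens.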
Example No. 30
            def step(self, state: Tuple[int, float],
                     action: bool) -> SampledDistribution[Tuple[int, float]]:
                if state[0] > expiry_time or state[0] == -1:
                    return None
                elif action:
                    return Constant(((-1, state[1]), payoffs(state[1])))
                else:

                    def sr_sampler_func(
                            state=state,
                            action=action) -> Tuple[Tuple[int, float], float]:
                        next_state_price: float = asset_distribution.sample()
                        next_state_time = state[0] + 1
                        reward: float = 0
                        return ((next_state_time, next_state_price), reward)

                    return SampledDistribution(sampler=sr_sampler_func,
                                               expectation_samples=1000)