Example #1
    def __init__(
        self,
        transition_reward_map: Mapping[S, FiniteDistribution[Tuple[S,
                                                                   float]]]):
        transition_map: Dict[S, FiniteDistribution[S]] = {}

        for state, trans in transition_reward_map.items():
            probabilities: Dict[S, float] = defaultdict(float)
            for (next_state, _), probability in trans:
                probabilities[next_state] += probability

            transition_map[state] = Categorical(probabilities)

        super().__init__(transition_map)

        nt: Set[S] = set(transition_reward_map.keys())
        self.transition_reward_map = {
            NonTerminal(s):
            Categorical({(NonTerminal(s1) if s1 in nt else Terminal(s1), r): p
                         for (s1, r), p in v.table().items()})
            for s, v in transition_reward_map.items()
        }

        self.reward_function_vec = np.array([
            sum(probability * reward
                for (_,
                     reward), probability in self.transition_reward_map[state])
            for state in self.non_terminal_states
        ])
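The constructor above folds the reward out of each (next_state, reward) distribution to recover the plain state-transition distribution needed by the parent class. A minimal standalone sketch of that marginalization, using ordinary dicts in place of the library's FiniteDistribution (the toy transition_reward_map is invented for illustration):

from collections import defaultdict
from typing import Dict, Tuple

transition_reward_map: Dict[str, Dict[Tuple[str, float], float]] = {
    'A': {('A', 1.0): 0.2, ('A', 2.0): 0.3, ('B', 0.0): 0.5},
    'B': {('A', 1.0): 0.6, ('B', 1.0): 0.4},
}

transition_map: Dict[str, Dict[str, float]] = {}
for state, trans in transition_reward_map.items():
    probabilities: Dict[str, float] = defaultdict(float)
    # Sum over all rewards that lead to the same next state.
    for (next_state, _), probability in trans.items():
        probabilities[next_state] += probability
    transition_map[state] = dict(probabilities)

print(transition_map)
# {'A': {'A': 0.5, 'B': 0.5}, 'B': {'A': 0.6, 'B': 0.4}}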
Example #2
File: prob_4.2.py Project: lkourti/RL-book
    def get_action_transition_reward_map(self) -> LilypadSoundMapping:
        d: Dict[LilypadState, Dict[str, Categorical[Tuple[LilypadState,
                                                            float]]]] = {}
        for lilypad in range(1, self.num_lilypads - 1):
            state: LilypadState = LilypadState(lilypad)
            d1: Dict[str, Categorical[Tuple[LilypadState, float]]] = {}

            # sound A
            sr_probs_dict_a: Dict[Tuple[LilypadState, float], float] = {}
            sr_probs_dict_a[(LilypadState(lilypad - 1), 0)] = \
                lilypad / (self.num_lilypads - 1)
            if lilypad + 1 == self.num_lilypads - 1:
                sr_probs_dict_a[(LilypadState(lilypad + 1), 1)] = \
                    1 - lilypad / (self.num_lilypads - 1)
            else:
                sr_probs_dict_a[(LilypadState(lilypad + 1), 0)] = \
                    1 - lilypad / (self.num_lilypads - 1)
            d1['A'] = Categorical(sr_probs_dict_a)

            # sound B
            sr_probs_dict_b: Dict[Tuple[LilypadState, float], float] = {}
            for i in range(self.num_lilypads):
                if i != lilypad:
                    if i == self.num_lilypads - 1:
                        sr_probs_dict_b[(LilypadState(i), 1)] = \
                            1 / (self.num_lilypads - 1)
                    else:
                        sr_probs_dict_b[(LilypadState(i), 0)] = \
                            1 / (self.num_lilypads - 1)
            d1['B'] = Categorical(sr_probs_dict_b)

            d[state] = d1
        return d
Example #3
File: frogcroak.py Project: GYY7/RL-book
    def get_action_transition_reward_map(self) -> FrogCroakMapping:
        d: Dict[FrogState, Dict[Any, Categorical[Tuple[FrogState,
                                                       float]]]] = {}

        for i in range(self.num_pads):
            state = FrogState(i)
            if i == 0 or i == self.num_pads - 1:
                d[state] = None
            else:
                d1: Dict[Any, Categorical[Tuple[FrogState, float]]] = {}
                # croak A: move down with prob i/(n-1), up with prob
                # 1 - i/(n-1); reaching the escape pad earns reward 1
                _reward = 1. if i + 1 == self.num_pads - 1 else 0.
                prob_dict_a: Dict[Tuple[FrogState, float], float] = {
                    (FrogState(i + 1), _reward): 1 - i / (self.num_pads - 1),
                    (FrogState(i - 1), 0.): i / (self.num_pads - 1)
                }
                d1['A'] = Categorical(prob_dict_a)
                # croak B: jump uniformly to any other pad; landing on the
                # escape pad earns reward 1
                prob_dict_b: Dict[Tuple[FrogState, float], float] = {
                    (FrogState(j), 1. if j == self.num_pads - 1 else 0.):
                    1 / (self.num_pads - 1)
                    for j in range(self.num_pads)
                    if j != i
                }
                d1['B'] = Categorical(prob_dict_b)
                d[state] = d1
        return d
Example #4
    def get_action_transition_reward_map(self) -> PadMapping:
        d: Dict[FrogEscapeState, Dict[int, Categorical[Tuple[FrogEscapeState,
                                                             float]]]] = {}

        # 0 state
        state0: FrogEscapeState = FrogEscapeState(0)
        d[state0] = None

        for i in range(1, self.n):
            state: FrogEscapeState = FrogEscapeState(i)
            di: Dict[int, Categorical[Tuple[FrogEscapeState, float]]] = {}

            # action A: move down with prob i/n (reward 0), up with prob
            # (n - i)/n (reward 1 if the escape pad n is reached)
            sr_probs_dict_A: Dict[Tuple[FrogEscapeState, float], float] = {
                (FrogEscapeState(i - 1), 0.0): i / self.n,
                (FrogEscapeState(i + 1), 1.0 if i + 1 == self.n else 0.0):
                (self.n - i) / self.n
            }
            di[0] = Categorical(sr_probs_dict_A)

            # action B: jump uniformly to any other pad; reaching pad n
            # earns reward 1
            sr_probs_dict_B: Dict[Tuple[FrogEscapeState, float], float] = {
                (FrogEscapeState(j), 1.0 if j == self.n else 0.0): 1 / self.n
                for j in range(self.n + 1) if j != i
            }
            di[1] = Categorical(sr_probs_dict_B)

            # add actions
            d[state] = di

        # n state
        staten: FrogEscapeState = FrogEscapeState(self.n)
        d[staten] = None

        return d
Example #5
 def get_transition_probabilities(self, nt_state: Cell) \
         -> Mapping[Move, Categorical[Tuple[Cell, float]]]:
     '''
     given a non-terminal state, return a dictionary whose
     keys are the valid actions (moves) from the given state
     and the corresponding values are the associated probabilities
     (following that move) of the (next_state, reward) pairs.
     The probabilities are determined from the wind probabilities
     of the column one is in after the move. Note that if one moves
     to a goal cell (terminal state), then one ends up in that
     goal cell with 100% probability (i.e., no wind exposure in a
     goal cell).
     '''
     d: Dict[Move, Categorical[Tuple[Cell, float]]] = {}
     for a, (r, c) in self.get_actions_and_next_states(nt_state):
         if (r, c) in self.terminals:
             d[a] = Categorical({((r, c), -1.): 1.})
         else:
             p1, p2 = self.wind[c][0], self.wind[c][1]
             dist: Dict[Tuple[Cell, float], float] = {}
             dist[((r, c), -1.)] = 1. - p1 - p2
             dist[((r, c), -1. - self.bump_cost)] = \
                 p1 * (((r - 1, c) in self.blocks) or (r == 0)) + \
                 p2 * (((r + 1, c) in self.blocks) or (r == self.rows - 1))
             dist[((r - 1, c), -1.)] = \
                 p1 * (not (((r - 1, c) in self.blocks) or (r == 0)))
             dist[((r + 1, c), -1.)] = \
                 p2 * (not (((r + 1, c) in self.blocks) or (r == self.rows - 1)))
             d[a] = Categorical(dist)
     return d
Example #6
 def get_transition_probabilities(self, nt_state: Cell) \
         -> Mapping[Move, Categorical[Tuple[Cell, float]]]:
     '''
     given a non-terminal state, return a dictionary whose
     keys are the valid actions (moves) from the given state
     and the corresponding values are the associated probabilities
     (following that move) of the (next_state, reward) pairs.
     The probabilities are determined from the wind probabilities
     of the column one is in after the move. Note that if one moves
     to a goal cell (terminal state), then one ends up in that
     goal cell with 100% probability (i.e., no wind exposure in a
     goal cell).
     '''
     d: Dict[Move, Categorical[Tuple[Cell, float]]] = {}
     for a, (r, c) in self.get_actions_and_next_states(nt_state):
         if (r, c) in self.terminals:
             d[a] = Categorical({((r, c), -1.): 1.})
         else:
             down_prob, up_prob = self.wind[c]
             stay_prob: float = 1. - down_prob - up_prob
             d1: Dict[Tuple[Cell, float], float] = \
                 {((r, c), -1.): stay_prob}
             if self.is_valid_state((r - 1, c)):
                 d1[((r - 1, c), -1.)] = down_prob
             if self.is_valid_state((r + 1, c)):
                 d1[((r + 1, c), -1.)] = up_prob
             d1[((r, c), -1. - self.bump_cost)] = \
                 down_prob * (1 - self.is_valid_state((r - 1, c))) + \
                 up_prob * (1 - self.is_valid_state((r + 1, c)))
             d[a] = Categorical(d1)
     return d
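Both gridworld examples build, for each landing cell, a distribution over (next_cell, reward) pairs from the wind probabilities of that cell's column. A standalone sketch of that bookkeeping for one hypothetical cell; the wind probabilities, grid dimensions, block set, and the is_valid helper below are invented stand-ins, not the class's actual data:

from typing import Dict, Tuple

down_prob, up_prob = 0.2, 0.1            # wind pushing down / up in this column
stay_prob = 1.0 - down_prob - up_prob
r, c, rows, bump_cost = 3, 4, 6, 4.0
blocks = {(2, 4)}                        # (r - 1, c) is blocked in this toy grid

def is_valid(cell: Tuple[int, int]) -> bool:
    row, _ = cell
    return 0 <= row < rows and cell not in blocks

dist: Dict[Tuple[Tuple[int, int], float], float] = {((r, c), -1.0): stay_prob}
if is_valid((r - 1, c)):
    dist[((r - 1, c), -1.0)] = down_prob
if is_valid((r + 1, c)):
    dist[((r + 1, c), -1.0)] = up_prob
# Probability of being blown into a wall or block: stay put and pay the bump cost.
dist[((r, c), -1.0 - bump_cost)] = \
    down_prob * (not is_valid((r - 1, c))) + \
    up_prob * (not is_valid((r + 1, c)))

print(dist, sum(dist.values()))          # the probabilities sum to 1.0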
Example #7
    def setUp(self):
        random.seed(42)

        self.finite_flip_flop = FlipFlop(0.7)

        self.finite_mdp = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({
                    (True, 1.0): 0.7,
                    (False, 2.0): 0.3
                }),
                False: Categorical({
                    (True, 1.0): 0.3,
                    (False, 2.0): 0.7
                }),
            },
            False: {
                True: Categorical({
                    (False, 1.0): 0.7,
                    (True, 2.0): 0.3
                }),
                False: Categorical({
                    (False, 1.0): 0.3,
                    (True, 2.0): 0.7
                }),
            },
        })
Example #8
File: prob2.py Project: lkourti/RL-book
 def get_transition_map(self) -> Transition[BlockState]:
     d: Dict[BlockState, Categorical[BlockState]] = {}
     for i in range(self.board_size - self.dice_size + 1):
         state_probs_map: Mapping[BlockState, float] = {}
         #{BlockState(self.sl_mapping[j]): 1/self.dice_size for j in range(i+1, i+self.dice_size+1)}
         for j in range(i + 1, i + self.dice_size + 1):
             if BlockState(self.sl_mapping[j]) not in state_probs_map:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] = 1 / self.dice_size
             else:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] += 1 / self.dice_size
         d[BlockState(i)] = Categorical(state_probs_map)
     for i in range(self.board_size - self.dice_size + 1, self.board_size):
         state_probs_map: Mapping[BlockState, float] = {}
         #{BlockState(self.sl_mapping[j]): 1/self.dice_size for j in range(i+1, self.board_size+1)}
         for j in range(i + 1, self.board_size + 1):
             if BlockState(self.sl_mapping[j]) not in state_probs_map:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] = 1 / self.dice_size
             else:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] += 1 / self.dice_size
         state_probs_map[BlockState(
             i)] = (self.dice_size - (self.board_size - i)) / self.dice_size
         d[BlockState(i)] = Categorical(state_probs_map)
     d[BlockState(self.board_size)] = None
     return d
Example #9
def sarsa_control_scratch(
        #traces: Iterable[Iterable[mp.TransitionStep[S]]],
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        num_episodes: int = 10000,
        eps: float = 0.1,
        base_lr: float = 0.03,
        half_life: float = 1000.0,
        exponent: float = 0.5) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    state = Categorical({state: 1 for state in states}).sample()
    for i in range(num_episodes):
        action_distribution = Pi.act(state)
        action = action_distribution.sample()
        next_distribution = mdp_to_sample.step(state, action)
        next_state, reward = next_distribution.sample()
        next_action = Pi.act(next_state).sample()
        counts_per_state_act[(state, action)] += 1
        alpha = base_lr / (1 + (
            (counts_per_state_act[(state, action)] - 1) / half_life)**exponent)
        # SARSA update: move Q(s, a) toward the one-step TD target
        q[(state,
           action)] += alpha * (reward + γ * q[(next_state, next_action)] -
                                q[(state, action)])
        new_pol: Mapping[S, Optional[Categorical[A]]] = Pi.policy_map
        if actions[state] is None:
            new_pol[state] = None
        policy_map = {
            action: eps / len(actions[state])
            for action in actions[state]
        }
        best_action = actions[state][0]
        for action in actions[state]:
            if q[(state, best_action)] <= q[(state, action)]:
                best_action = action
        policy_map[best_action] += 1 - eps
        new_pol[state] = Categorical(policy_map)
        Pi = FinitePolicy(new_pol)
        state = next_state
        if next_state is None:
            state = Categorical({state: 1 for state in states}).sample()
    return q
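The policy-improvement step above rebuilds an epsilon-greedy distribution over actions from the current Q-values. A standalone sketch of that construction for a single state (the state label, action names, and Q-table are invented):

eps = 0.1
actions = ['left', 'right', 'stay']
q = {('s0', 'left'): 1.2, ('s0', 'right'): 2.5, ('s0', 'stay'): 0.3}

# Every action gets eps / |A(s)| probability...
policy_probs = {a: eps / len(actions) for a in actions}
# ...and the greedy action receives the remaining 1 - eps on top.
best_action = max(actions, key=lambda a: q[('s0', a)])
policy_probs[best_action] += 1 - eps

print(policy_probs)
# {'left': 0.0333..., 'right': 0.9333..., 'stay': 0.0333...}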
Example #10
    def test_unwrap_finite_horizon_MDP(self):
        finite = finite_horizon_MDP(self.finite_flip_flop, 10)
        unwrapped = unwrap_finite_horizon_MDP(finite)

        self.assertEqual(len(unwrapped), 10)

        def action_mapping_for(s: WithTime[bool]) -> \
                ActionMapping[bool, WithTime[bool]]:
            same = NonTerminal(s.step_time())
            different = NonTerminal(dataclasses.replace(
                s.step_time(),
                state=not s.state
            ))

            return {
                True: Categorical({
                    (same, 1.0): 0.7,
                    (different, 2.0): 0.3
                }),
                False: Categorical({
                    (same, 1.0): 0.3,
                    (different, 2.0): 0.7
                })
            }

        for t in range(9):
            for s in True, False:
                s_time = WithTime(state=s, time=t)
                for a in True, False:
                    distribution.assert_almost_equal(
                        self,
                        finite.mapping[NonTerminal(s_time)][a],
                        action_mapping_for(s_time)[a]
                    )

        for s in True, False:
            s_time = WithTime(state=s, time=9)
            same = Terminal(s_time.step_time())
            different = Terminal(dataclasses.replace(
                s_time.step_time(),
                state=not s_time.state
            ))
            act_map = {
                True: Categorical({
                    (same, 1.0): 0.7,
                    (different, 2.0): 0.3
                }),
                False: Categorical({
                    (same, 1.0): 0.3,
                    (different, 2.0): 0.7
                })
            }
            for a in True, False:
                distribution.assert_almost_equal(
                    self,
                    finite.mapping[NonTerminal(s_time)][a],
                    act_map[a]
                )
Example #11
 def get_mapping(self) -> StateActionMapping[State, Action]:
     #We need to define the StateActionMapping for this Finite MDP
     mapping: StateActionMapping[State, Action] = {}
     list_actions: List[Action] = []
     #We start by defining all the available actions
     for i in range(self.H + 1):
         range_j = self.H - i
         for j in range(range_j + 1):
             list_actions.append(Action(i, j))
     self.list_actions: List[Action] = list_actions
     list_states: List[State] = []
     #Then we define all the possible states
     for i in range(1, self.W + 1):
         list_states.append(State(i))
     self.list_states: List[State] = list_states
     for state in list_states:
         submapping: ActionMapping[Action, StateReward[State]] = {}
         for action in list_actions:
             s: int = action.s
             l: int = action.l
             reward: float = state.wage * (self.H - l - s)
             pois_mean: float = self.alpha * l
             proba_offer: float = self.beta * s / self.H
             if state.wage == self.W:
                  # If you're in state W, you stay in state W with
                  # probability 1. The reward only depends on the action
                  # you have chosen
                 submapping[action] = Constant((state, reward))
             elif state.wage == self.W - 1:
                 #If you're in state W-1, you can either stay in your state
                 #or land in state W
                  submapping[action] = Categorical({
                      (state, reward):
                          poisson.pmf(0, pois_mean) * (1 - proba_offer),
                      (State(self.W), reward):
                          proba_offer + (1 - proba_offer) *
                          (1 - poisson.pmf(0, pois_mean))
                  })
             else:
                 #If you're in any other state, you can land to any state
                 #Between your current state and W with probabilities
                 #as described before
                 dic_distrib = {}
                  dic_distrib[(state, reward)] = \
                      poisson.pmf(0, pois_mean) * (1 - proba_offer)
                  dic_distrib[(State(state.wage + 1), reward)] = \
                      proba_offer * poisson.cdf(1, pois_mean) + \
                      (1 - proba_offer) * poisson.pmf(1, pois_mean)
                 for k in range(2, self.W - state.wage):
                     dic_distrib[(State(state.wage + k),
                                  reward)] = poisson.pmf(k, pois_mean)
                 dic_distrib[(State(self.W), reward)] = 1 - poisson.cdf(
                     self.W - state.wage - 1, pois_mean)
                 submapping[action] = Categorical(dic_distrib)
         mapping[state] = submapping
     return mapping
Example #12
def mc_control_scratch(
        #traces: Iterable[Iterable[mp.TransitionStep[S]]],
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    start_state_distrib = Categorical({state: 1 for state in states})
    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)
        #print(episode)
        for step in episode:
            state = step.state
            action = step.action
            return_ = step.return_
            counts_per_state_act[(state, action)] += 1
            q[(state, action)] += 1 / counts_per_state_act[
                (state, action)] * (return_ - q[(state, action)])
        eps = 1 / (i + 1)
        new_pol: Mapping[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
                continue
            policy_map = {
                action: eps / len(actions[state])
                for action in actions[state]
            }
            best_action = actions[state][0]
            for action in actions[state]:
                if q[(state, best_action)] <= q[(state, action)]:
                    best_action = action
            policy_map[best_action] += 1 - eps
            new_pol[state] = Categorical(policy_map)
        Pi = FinitePolicy(new_pol)

    return q
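The update q += (return_ - q) / count used above is the incremental form of a sample average, so each Q-value tracks the mean of the returns observed for its state-action pair. A quick standalone check with invented returns:

returns_seen = [4.0, 6.0, 5.0, 9.0]

q, n = 0.0, 0
for g in returns_seen:
    n += 1
    q += (g - q) / n              # incremental mean after n observations

print(q, sum(returns_seen) / len(returns_seen))   # both print 6.0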
Example #13
 def transition_for(_):
     return {
         True: Categorical({
             (NonTerminal(True), 1.0): 0.3,
             (NonTerminal(False), 2.0): 0.7
         }),
         False: Categorical({
             (NonTerminal(True), 2.0): 0.7,
             (NonTerminal(False), 1.0): 0.3
         })
     }
Example #14
 def transition_for(time):
     return {
         True: Categorical({
             (True, 1.0): 0.3,
             (False, 2.0): 0.7,
         }),
         False: Categorical({
             (True, 2.0): 0.7,
             (False, 1.0): 0.3,
         })
     }
Example #15
 def setUp(self):
     self.finite_flip_flop = FiniteMarkovDecisionProcess({
         True: {
             True: Categorical({(True, 1.0): 0.7, (False, 2.0): 0.3}),
             False: Categorical({(True, 1.0): 0.3, (False, 2.0): 0.7}),
         },
         False: {
             True: Categorical({(False, 1.0): 0.7, (True, 2.0): 0.3}),
             False: Categorical({(False, 1.0): 0.3, (True, 2.0): 0.7}),
         }
     })
Example #16
        def action_mapping_for(
                s: WithTime[bool]) -> ActionMapping[bool, WithTime[bool]]:
            same = s.step_time()
            different = dataclasses.replace(s.step_time(), state=not s.state)

            return {
                True: Categorical({
                    (same, 1.0): 0.7,
                    (different, 2.0): 0.3
                }),
                False: Categorical({
                    (same, 1.0): 0.3,
                    (different, 2.0): 0.7
                })
            }
Example #17
 def get_opt_vf_and_policy(self) -> \
         Iterator[Tuple[V[int], FinitePolicy[int, bool]]]:
     dt: float = self.dt()
     up_factor: float = np.exp(self.vol * np.sqrt(dt))
     up_prob: float = (np.exp(self.rate * dt) * up_factor - 1) / \
         (up_factor * up_factor - 1)
     return optimal_vf_and_policy(
         steps=[
             {j: None if j == -1 else {
                 True: Constant(
                     (
                         -1,
                         self.payoff(i * dt, self.state_price(i, j))
                     )
                 ),
                 False: Categorical(
                     {
                         (j + 1, 0.): up_prob,
                         (j, 0.): 1 - up_prob
                     }
                 )
             } for j in range(i + 1)}
             for i in range(self.num_steps + 1)
         ],
         gamma=np.exp(-self.rate * dt)
     )
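The up_prob above is the risk-neutral up-move probability of a binomial lattice with u = exp(vol * sqrt(dt)) and d = 1/u, i.e. q = (exp(r*dt) - d) / (u - d) = (exp(r*dt)*u - 1) / (u*u - 1). A hedged numeric check, with invented parameters, that the discounted one-step expectation recovers the current price:

import numpy as np

rate, vol, dt, price = 0.05, 0.25, 1 / 300, 100.0
u = np.exp(vol * np.sqrt(dt))
q = (np.exp(rate * dt) * u - 1) / (u * u - 1)

one_step = np.exp(-rate * dt) * (q * price * u + (1 - q) * price / u)
print(q, one_step)   # q lies in (0, 1) and one_step is approximately 100.0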
Example #18
class TestFiniteDistribution(unittest.TestCase):
    def setUp(self):
        self.die = Choose({1, 2, 3, 4, 5, 6})

        self.ragged = Categorical({0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025})

    def test_map(self):
        plusOne = self.die.map(lambda x: x + 1)
        assert_almost_equal(self, plusOne, Choose({2, 3, 4, 5, 6, 7}))

        evenOdd = self.die.map(lambda x: x % 2 == 0)
        assert_almost_equal(self, evenOdd, Choose({True, False}))

        greaterThan4 = self.die.map(lambda x: x > 4)
        assert_almost_equal(self, greaterThan4,
                            Categorical({
                                True: 1 / 3,
                                False: 2 / 3
                            }))

    def test_expectation(self):
        self.assertAlmostEqual(self.die.expectation(float), 3.5)

        even = self.die.map(lambda n: n % 2 == 0)
        self.assertAlmostEqual(even.expectation(float), 0.5)

        self.assertAlmostEqual(self.ragged.expectation(float), 0.175)
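The asserted expectation of 0.175 for the ragged distribution is just the probability-weighted sum of its outcomes; a standalone check with a plain dict in place of Categorical:

ragged = {0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025}
expectation = sum(outcome * prob for outcome, prob in ragged.items())
print(expectation)   # 0*0.9 + 1*0.05 + 2*0.025 + 3*0.025 = 0.175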
Example #19
    def test_constant(self):
        assert_almost_equal(self, self.fair,
                            Categorical({
                                True: 0.5,
                                False: 0.5
                            }))
        self.assertAlmostEqual(self.fair.probability(True), 0.5)
        self.assertAlmostEqual(self.fair.probability(False), 0.5)

        assert_almost_equal(self, self.unfair,
                            Categorical({
                                True: 0.3,
                                False: 0.7
                            }))
        self.assertAlmostEqual(self.unfair.probability(True), 0.3)
        self.assertAlmostEqual(self.unfair.probability(False), 0.7)
Example #20
    def transition(self, state: StateMP2) -> Categorical[StateMP2]:
        up_p = self.up_prob(state)

        return Categorical({
            StateMP2(state.price + 1, True): up_p,
            StateMP2(state.price - 1, False): 1 - up_p
        })
Example #21
    def transition(self, state: StateMP1) -> Categorical[StateMP1]:
        up_p = self.up_prob(state)

        return Categorical({
            StateMP1(state.price + 1): up_p,
            StateMP1(state.price - 1): 1 - up_p
        })
Example #22
    def get_action_transition_reward_map(self) -> FrogCroakMapping:
        """Get the mapping from states and actions to rewards and new states.
        """
        d: Dict[FrogState, Dict[Croak,
                                Categorical[Tuple[FrogState, float]]]] = {
                                    FrogState(self.river.n_lily): None,
                                    FrogState(0): None
                                }

        for position in range(1, self.river.n_lily):
            state: FrogState = FrogState(position)
            d1: Dict[Croak, Categorical[Tuple[FrogState, float]]] = {}

            for croak_A in (True, False):
                sr_probs_dict: Dict[Tuple[FrogState, float], float] =\
                    {
                        (FrogState(next_p), self.reward_at_position(next_p)):
                        self.get_transit_prob_from_croak(
                            state, FrogState(next_p), croak_A
                        ) for next_p in range(self.river.n_lily+1)
                    }
                d1[croak_A] = Categorical(sr_probs_dict)

            d[state] = d1
        return d
Example #23
    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[InventoryState, Dict[int, Categorical[Tuple[InventoryState,
                                                            float]]]] = {}

        for alpha in range(self.capacity + 1):
            for beta in range(self.capacity + 1 - alpha):
                state: InventoryState = InventoryState(alpha, beta)
                ip: int = state.inventory_position()
                base_reward: float = -self.holding_cost * alpha
                d1: Dict[int, Categorical[Tuple[InventoryState, float]]] = {}

                for order in range(self.capacity - ip + 1):
                    sr_probs_dict: Dict[Tuple[InventoryState, float], float] =\
                        {(InventoryState(ip - i, order), base_reward):
                         self.poisson_distr.pmf(i) for i in range(ip)}

                    probability: float = 1 - self.poisson_distr.cdf(ip - 1)
                    reward: float = base_reward - self.stockout_cost *\
                        (probability * (self.poisson_lambda - ip) +
                         ip * self.poisson_distr.pmf(ip))
                    sr_probs_dict[(InventoryState(0, order), reward)] = \
                        probability
                    d1[order] = Categorical(sr_probs_dict)

                d[state] = d1
        return d
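For each order quantity, the loop above places Poisson mass pmf(i) on demands 0 through ip - 1 and lumps the tail mass 1 - cdf(ip - 1) onto the stock-out outcome, so every Categorical it builds is a proper distribution. A hedged sanity check using scipy's Poisson with an invented demand rate and inventory position:

from scipy.stats import poisson

poisson_lambda, ip = 2.0, 3
poisson_distr = poisson(poisson_lambda)

masses = [poisson_distr.pmf(i) for i in range(ip)]
tail = 1 - poisson_distr.cdf(ip - 1)
print(sum(masses) + tail)   # 1.0 up to floating-point error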
Example #24
 def get_transition_reward_map(
         self) -> RewardTransition[StateSnakeAndLadder]:
     d: Dict[StateSnakeAndLadder, Categorical[Tuple[StateSnakeAndLadder,
                                                    float]]] = {}
     dic_mapping = {}
     for i in self.grid:
         dic_mapping[i.start] = i.end
     for i in range(1, self.grid_size):
         state = StateSnakeAndLadder(position=i)
         dic_positions_associated: dict = {}
         for j in range(i + 1, i + 7):
             if (dic_mapping.get(j)) is not None:
                 new_pos = dic_mapping[j]
             else:
                 new_pos = j
             if new_pos > self.grid_size:
                 new_pos = self.grid_size
             if new_pos in dic_positions_associated.keys():
                 dic_positions_associated[new_pos] += 1
             else:
                 dic_positions_associated[new_pos] = 1
         sr_probs_map: Dict[Tuple[StateSnakeAndLadder, float], float] = {
             (StateSnakeAndLadder(position=j), 1.):
                 dic_positions_associated[j] / 6
             for j in dic_positions_associated
         }
         d[state] = Categorical(sr_probs_map)
     d[StateSnakeAndLadder(position=self.grid_size)] = None
     return d
Example #25
    def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]:
        ep_rewards: ndarray = empty(self.time_steps)
        ep_actions: ndarray = empty(self.time_steps, dtype=int)
        scores: List[float] = [0.] * self.num_arms
        avg_reward: float = 0.

        for i in range(self.time_steps):
            max_score: float = max(scores)
            exp_scores: Sequence[float] = [exp(s - max_score) for s in scores]
            sum_exp_scores = sum(exp_scores)
            probs: Sequence[float] = [s / sum_exp_scores for s in exp_scores]
            action: int = Categorical(
                {i: p for i, p in enumerate(probs)}
            ).sample()
            reward: float = self.arm_distributions[action].sample()
            avg_reward += (reward - avg_reward) / (i + 1)
            step_size: float = self.learning_rate *\
                (i / self.learning_rate_decay + 1) ** -0.5
            for j in range(self.num_arms):
                scores[j] += step_size * (reward - avg_reward) *\
                             ((1 if j == action else 0) - probs[j])

            ep_rewards[i] = reward
            ep_actions[i] = action
        return ep_rewards, ep_actions
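The action selection above samples from a softmax over the current scores, shifted by the maximum score for numerical stability. A standalone sketch with invented scores, using random.choices in place of Categorical:

import random
from math import exp

scores = [0.4, 1.1, -0.3]
max_score = max(scores)
exp_scores = [exp(s - max_score) for s in scores]   # shift keeps exp() well-behaved
total = sum(exp_scores)
probs = [e / total for e in exp_scores]

action = random.choices(range(len(scores)), weights=probs, k=1)[0]
print(probs, action)   # probs sum to 1; action is a sampled arm index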
Example #26
    def test_finite_horizon_MRP(self):
        finite = finite_horizon_MRP(self.finite_flip_flop, 10)

        trues = [NonTerminal(WithTime(True, time)) for time in range(10)]
        falses = [NonTerminal(WithTime(False, time)) for time in range(10)]
        non_terminal_states = set(trues + falses)
        self.assertEqual(set(finite.non_terminal_states), non_terminal_states)

        expected_transition = {}
        for state in non_terminal_states:
            t: int = state.state.time
            st: bool = state.state.state
            if t < 9:
                prob = {
                    (NonTerminal(WithTime(st, t + 1)), 1.0): 0.3,
                    (NonTerminal(WithTime(not st, t + 1)), 2.0): 0.7
                }
            else:
                prob = {
                    (Terminal(WithTime(st, t + 1)), 1.0): 0.3,
                    (Terminal(WithTime(not st, t + 1)), 2.0): 0.7
                }

            expected_transition[state] = Categorical(prob)

        for state in non_terminal_states:
            distribution.assert_almost_equal(
                self,
                finite.transition_reward(state),
                expected_transition[state])
Example #27
    def get_action_transition_reward_map(
            self) -> StateActionMapping[int, DaySplit]:

        d: Dict[int, Dict[DaySplit, Categorical[Tuple[int, int]]]] = {}

        for w in range(1, self.W + 1):
            actionMap: Dict[DaySplit, Categorical[Tuple[int, int]]] = {}
            for l in range(self.H + 1):
                for s in range(self.H + 1 - l):
                    action = DaySplit(l, s, self.H - l - s)
                    probDict = {}
                    probDict[(w, action.w *
                              w)] = poisson(self.alpha * action.l, 0) * (
                                  1 - self.beta * action.s / self.H)
                    probDict[(min(
                        self.W, w + 1), action.w * w)] = probDict.get(
                            (min(self.W, w + 1), action.w * w), 0) + poisson(
                                self.alpha * action.l,
                                1) + poisson(self.alpha * action.l, 0) * (
                                    self.beta * action.s / self.H)
                    for k in range(2, self.W - w):
                        probDict[(w + k, action.w * w)] = poisson(
                            self.alpha * action.l, k)
                    probDict[(self.W, action.w * w)] = probDict.get(
                        (self.W, action.w * w), 0) + 1 - sum(probDict.values())
                    actionMap[action] = Categorical(probDict)
            d[w] = actionMap
        return d
Example #28
    def test_finite_horizon_MRP(self):
        finite = finite_horizon_MRP(self.finite_flip_flop, 10)

        trues = [WithTime(True, time) for time in range(0, 10)]
        falses = [WithTime(False, time) for time in range(0, 10)]
        non_terminal_states = set(trues + falses)
        terminal_states = {WithTime(True, 10), WithTime(False, 10)}
        expected_states = non_terminal_states.union(terminal_states)

        self.assertEqual(set(finite.states()), expected_states)

        expected_transition = {}
        for state in non_terminal_states:
            expected_transition[state] =\
                Categorical({
                    (WithTime(state.state, state.time + 1), 1.0): 0.3,
                    (WithTime(not state.state, state.time + 1), 2.0): 0.7
                })

        for state in non_terminal_states:
            distribution.assert_almost_equal(self,
                                             finite.transition_reward(state),
                                             expected_transition[state])

        for state in terminal_states:
            self.assertEqual(finite.transition(state), None)
Example #29
    def get_transition_map(self) -> mp.Transition[mp.S]:
        """
        Get the transition map associated with the playing board.

        The transition map associated with each move is dictated by the uniform
        probability of rolling each value on the die, the landing position
        associated with each possible roll, the presence of a snake or ladder
        at the landing position, and proximity to the end of the board.

        A `GameState` object is instantiated for each tile position on the
        board. That object is mapped to N subsequent positions with uniform
        probability, where N is the number of sides on the playing die.
        Corrections are made to the position attribute of each `GameState` 
        object if the position lies beyond the end of the board or corresponds
        to snakes or ladders.

        :returns: Transition map representing "Snakes and Ladders" board
        """
        d: Dict[GameState, Optional[Categorical[GameState]]] = {
            GameState(self.num_tiles) : None
        }
        for position in range(self.num_tiles):
            game_state = GameState(position)
            game_state_prob_map: Mapping[GameState, float] = {
                GameState(i).check_SL(
                    self.snakes_and_ladders
                ).check_board_end(
                    self.num_tiles, i-position
                ) : self.uniform_distr for i in range(
                    position+1,
                    position+self.dice_sides+1
                )
            }
            d[game_state] = Categorical(game_state_prob_map)
        return d
Example #30
    def get_transition_reward_map(self) -> RewardTransition[SnakesAndLaddersState]:
        d: Dict[SnakesAndLaddersState,
                Categorical[Tuple[SnakesAndLaddersState, float]]] = {}
        for pos in range(1,100+1):
            state = SnakesAndLaddersState(pos)
            sr_probs_map: Dict[Tuple[SnakesAndLaddersState, float], float] = {}
            for j in range(1, 6+1):
                next_state = SnakesAndLaddersState(snake_or_ladder(pos + j))
                probs = end_game_probs(pos, snake_or_ladder(pos + j))
                reward = 1.0 if pos != 100 else 0.0
                sr_probs_map[(next_state, reward)] = probs
            d[state] = Categorical(sr_probs_map)
        return d