Example #1
    def __init__(
        self,
        transition_reward_map: Mapping[S, FiniteDistribution[Tuple[S,
                                                                   float]]]):
        transition_map: Dict[S, FiniteDistribution[S]] = {}

        for state, trans in transition_reward_map.items():
            probabilities: Dict[S, float] = defaultdict(float)
            for (next_state, _), probability in trans:
                probabilities[next_state] += probability

            transition_map[state] = Categorical(probabilities)

        super().__init__(transition_map)

        nt: Set[S] = set(transition_reward_map.keys())
        self.transition_reward_map = {
            NonTerminal(s):
            Categorical({(NonTerminal(s1) if s1 in nt else Terminal(s1), r): p
                         for (s1, r), p in v.table().items()})
            for s, v in transition_reward_map.items()
        }

        self.reward_function_vec = np.array([
            sum(probability * reward
                for (_,
                     reward), probability in self.transition_reward_map[state])
            for state in self.non_terminal_states
        ])
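The constructor above folds the reward out of each (next_state, reward) distribution to recover the plain state-transition distribution needed by the parent class. A minimal standalone sketch of that marginalization, using ordinary dicts in place of the library's FiniteDistribution (the toy transition_reward_map is invented for illustration):

from collections import defaultdict
from typing import Dict, Tuple

transition_reward_map: Dict[str, Dict[Tuple[str, float], float]] = {
    'A': {('A', 1.0): 0.2, ('A', 2.0): 0.3, ('B', 0.0): 0.5},
    'B': {('A', 1.0): 0.6, ('B', 1.0): 0.4},
}

transition_map: Dict[str, Dict[str, float]] = {}
for state, trans in transition_reward_map.items():
    probabilities: Dict[str, float] = defaultdict(float)
    # Sum over all rewards that lead to the same next state.
    for (next_state, _), probability in trans.items():
        probabilities[next_state] += probability
    transition_map[state] = dict(probabilities)

print(transition_map)
# {'A': {'A': 0.5, 'B': 0.5}, 'B': {'A': 0.6, 'B': 0.4}}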
Example #2
File: prob_4.2.py Project: lkourti/RL-book
    def get_action_transition_reward_map(self) -> LilypadSoundMapping:
        d: Dict[LilypadState, Dict[str, Categorical[Tuple[LilypadState,
                                                            float]]]] = {}
        for lilypad in range(1, self.num_lilypads - 1):
            state: LilypadState = LilypadState(lilypad)
            d1: Dict[str, Categorical[Tuple[LilypadState, float]]] = {}

            # sound A
            sr_probs_dict_a: Dict[Tuple[LilypadState, float], float] = {}
            sr_probs_dict_a[(LilypadState(lilypad - 1), 0)] = \
                lilypad / (self.num_lilypads - 1)
            if lilypad + 1 == self.num_lilypads - 1:
                sr_probs_dict_a[(LilypadState(lilypad + 1), 1)] = \
                    1 - lilypad / (self.num_lilypads - 1)
            else:
                sr_probs_dict_a[(LilypadState(lilypad + 1), 0)] = \
                    1 - lilypad / (self.num_lilypads - 1)
            d1['A'] = Categorical(sr_probs_dict_a)

            # sound B
            sr_probs_dict_b: Dict[Tuple[LilypadState, float], float] = {}
            for i in range(self.num_lilypads):
                if i != lilypad:
                    if i == self.num_lilypads - 1:
                        sr_probs_dict_b[(LilypadState(i), 1)] = \
                            1 / (self.num_lilypads - 1)
                    else:
                        sr_probs_dict_b[(LilypadState(i), 0)] = \
                            1 / (self.num_lilypads - 1)
            d1['B'] = Categorical(sr_probs_dict_b)

            d[state] = d1
        return d
Example #3
File: frogcroak.py Project: GYY7/RL-book
    def get_action_transition_reward_map(self) -> FrogCroakMapping:
        d: Dict[FrogState, Dict[Any, Categorical[Tuple[FrogState,
                                                       float]]]] = {}

        for i in range(self.num_pads):
            state = FrogState(i)
            if i == 0 or i == self.num_pads - 1:
                d[state] = None
            else:
                d1: Dict[Any, Categorical[Tuple[FrogState, float]]] = {}
                # croak A: move down with prob i/(n-1), up with prob
                # 1 - i/(n-1); reaching the escape pad earns reward 1
                _reward = 1. if i + 1 == self.num_pads - 1 else 0.
                prob_dict_a: Dict[Tuple[FrogState, float], float] = {
                    (FrogState(i + 1), _reward): 1 - i / (self.num_pads - 1),
                    (FrogState(i - 1), 0.): i / (self.num_pads - 1)
                }
                d1['A'] = Categorical(prob_dict_a)
                # croak B: jump uniformly to any other pad; landing on the
                # escape pad earns reward 1
                prob_dict_b: Dict[Tuple[FrogState, float], float] = {
                    (FrogState(j), 1. if j == self.num_pads - 1 else 0.):
                    1 / (self.num_pads - 1)
                    for j in range(self.num_pads)
                    if j != i
                }
                d1['B'] = Categorical(prob_dict_b)
                d[state] = d1
        return d
Example #4
    def get_action_transition_reward_map(self) -> PadMapping:
        d: Dict[FrogEscapeState, Dict[int, Categorical[Tuple[FrogEscapeState,
                                                             float]]]] = {}

        # 0 state
        state0: FrogEscapeState = FrogEscapeState(0)
        d[state0] = None

        for i in range(1, self.n):
            state: FrogEscapeState = FrogEscapeState(i)
            di: Dict[int, Categorical[Tuple[FrogEscapeState, float]]] = {}

            # action A: move down with prob i/n (reward 0), up with prob
            # (n - i)/n (reward 1 if the escape pad n is reached)
            sr_probs_dict_A: Dict[Tuple[FrogEscapeState, float], float] = {
                (FrogEscapeState(i - 1), 0.0): i / self.n,
                (FrogEscapeState(i + 1), 1.0 if i + 1 == self.n else 0.0):
                (self.n - i) / self.n
            }
            di[0] = Categorical(sr_probs_dict_A)

            # action B: jump uniformly to any other pad; reaching pad n
            # earns reward 1
            sr_probs_dict_B: Dict[Tuple[FrogEscapeState, float], float] = {
                (FrogEscapeState(j), 1.0 if j == self.n else 0.0): 1 / self.n
                for j in range(self.n + 1) if j != i
            }
            di[1] = Categorical(sr_probs_dict_B)

            # add actions
            d[state] = di

        # n state
        staten: FrogEscapeState = FrogEscapeState(self.n)
        d[staten] = None

        return d
Example #5
 def get_transition_probabilities(self, nt_state: Cell) \
         -> Mapping[Move, Categorical[Tuple[Cell, float]]]:
     '''
     given a non-terminal state, return a dictionary whose
     keys are the valid actions (moves) from the given state
     and the corresponding values are the associated probabilities
     (following that move) of the (next_state, reward) pairs.
     The probabilities are determined from the wind probabilities
     of the column one is in after the move. Note that if one moves
     to a goal cell (terminal state), then one ends up in that
     goal cell with 100% probability (i.e., no wind exposure in a
     goal cell).
     '''
     d: Dict[Move, Categorical[Tuple[Cell, float]]] = {}
     for a, (r, c) in self.get_actions_and_next_states(nt_state):
         if (r, c) in self.terminals:
             d[a] = Categorical({((r, c), -1.): 1.})
         else:
             p1, p2 = self.wind[c][0], self.wind[c][1]
             dist: Dict[Tuple[Cell, float], float] = {}
             dist[((r, c), -1.)] = 1. - p1 - p2
             dist[((r, c), -1. - self.bump_cost)] = \
                 p1 * (((r - 1, c) in self.blocks) or (r == 0)) + \
                 p2 * (((r + 1, c) in self.blocks) or (r == self.rows - 1))
             dist[((r - 1, c), -1.)] = \
                 p1 * (not (((r - 1, c) in self.blocks) or (r == 0)))
             dist[((r + 1, c), -1.)] = \
                 p2 * (not (((r + 1, c) in self.blocks) or (r == self.rows - 1)))
             d[a] = Categorical(dist)
     return d
Example #6
 def get_transition_probabilities(self, nt_state: Cell) \
         -> Mapping[Move, Categorical[Tuple[Cell, float]]]:
     '''
     given a non-terminal state, return a dictionary whose
     keys are the valid actions (moves) from the given state
     and the corresponding values are the associated probabilities
     (following that move) of the (next_state, reward) pairs.
     The probabilities are determined from the wind probabilities
     of the column one is in after the move. Note that if one moves
     to a goal cell (terminal state), then one ends up in that
     goal cell with 100% probability (i.e., no wind exposure in a
     goal cell).
     '''
     d: Dict[Move, Categorical[Tuple[Cell, float]]] = {}
     for a, (r, c) in self.get_actions_and_next_states(nt_state):
         if (r, c) in self.terminals:
             d[a] = Categorical({((r, c), -1.): 1.})
         else:
             down_prob, up_prob = self.wind[c]
             stay_prob: float = 1. - down_prob - up_prob
             d1: Dict[Tuple[Cell, float], float] = \
                 {((r, c), -1.): stay_prob}
             if self.is_valid_state((r - 1, c)):
                 d1[((r - 1, c), -1.)] = down_prob
             if self.is_valid_state((r + 1, c)):
                 d1[((r + 1, c), -1.)] = up_prob
             d1[((r, c), -1. - self.bump_cost)] = \
                 down_prob * (1 - self.is_valid_state((r - 1, c))) + \
                 up_prob * (1 - self.is_valid_state((r + 1, c)))
             d[a] = Categorical(d1)
     return d
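Both gridworld examples build, for each landing cell, a distribution over (next_cell, reward) pairs from the wind probabilities of that cell's column. A standalone sketch of that bookkeeping for one hypothetical cell; the wind probabilities, grid dimensions, block set, and the is_valid helper below are invented stand-ins, not the class's actual data:

from typing import Dict, Tuple

down_prob, up_prob = 0.2, 0.1            # wind pushing down / up in this column
stay_prob = 1.0 - down_prob - up_prob
r, c, rows, bump_cost = 3, 4, 6, 4.0
blocks = {(2, 4)}                        # (r - 1, c) is blocked in this toy grid

def is_valid(cell: Tuple[int, int]) -> bool:
    row, _ = cell
    return 0 <= row < rows and cell not in blocks

dist: Dict[Tuple[Tuple[int, int], float], float] = {((r, c), -1.0): stay_prob}
if is_valid((r - 1, c)):
    dist[((r - 1, c), -1.0)] = down_prob
if is_valid((r + 1, c)):
    dist[((r + 1, c), -1.0)] = up_prob
# Probability of being blown into a wall or block: stay put and pay the bump cost.
dist[((r, c), -1.0 - bump_cost)] = \
    down_prob * (not is_valid((r - 1, c))) + \
    up_prob * (not is_valid((r + 1, c)))

print(dist, sum(dist.values()))          # the probabilities sum to 1.0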
Example #7
    def setUp(self):
        random.seed(42)

        self.finite_flip_flop = FlipFlop(0.7)

        self.finite_mdp = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({
                    (True, 1.0): 0.7,
                    (False, 2.0): 0.3
                }),
                False: Categorical({
                    (True, 1.0): 0.3,
                    (False, 2.0): 0.7
                }),
            },
            False: {
                True: Categorical({
                    (False, 1.0): 0.7,
                    (True, 2.0): 0.3
                }),
                False: Categorical({
                    (False, 1.0): 0.3,
                    (True, 2.0): 0.7
                }),
            },
        })
Example #8
File: prob2.py Project: lkourti/RL-book
 def get_transition_map(self) -> Transition[BlockState]:
     d: Dict[BlockState, Categorical[BlockState]] = {}
     for i in range(self.board_size - self.dice_size + 1):
         state_probs_map: Mapping[BlockState, float] = {}
         #{BlockState(self.sl_mapping[j]): 1/self.dice_size for j in range(i+1, i+self.dice_size+1)}
         for j in range(i + 1, i + self.dice_size + 1):
             if BlockState(self.sl_mapping[j]) not in state_probs_map:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] = 1 / self.dice_size
             else:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] += 1 / self.dice_size
         d[BlockState(i)] = Categorical(state_probs_map)
     for i in range(self.board_size - self.dice_size + 1, self.board_size):
         state_probs_map: Mapping[BlockState, float] = {}
         #{BlockState(self.sl_mapping[j]): 1/self.dice_size for j in range(i+1, self.board_size+1)}
         for j in range(i + 1, self.board_size + 1):
             if BlockState(self.sl_mapping[j]) not in state_probs_map:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] = 1 / self.dice_size
             else:
                 state_probs_map[BlockState(
                     self.sl_mapping[j])] += 1 / self.dice_size
         state_probs_map[BlockState(
             i)] = (self.dice_size - (self.board_size - i)) / self.dice_size
         d[BlockState(i)] = Categorical(state_probs_map)
     d[BlockState(self.board_size)] = None
     return d
Example #9
def sarsa_control_scratch(
        #traces: Iterable[Iterable[mp.TransitionStep[S]]],
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        num_episodes: int = 10000,
        eps: float = 0.1,
        base_lr: float = 0.03,
        half_life: float = 1000.0,
        exponent: float = 0.5) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    state = Categorical({state: 1 for state in states}).sample()
    for i in range(num_episodes):
        action_distribution = Pi.act(state)
        action = action_distribution.sample()
        next_distribution = mdp_to_sample.step(state, action)
        next_state, reward = next_distribution.sample()
        next_action = Pi.act(next_state).sample()
        counts_per_state_act[(state, action)] += 1
        alpha = base_lr / (1 + (
            (counts_per_state_act[(state, action)] - 1) / half_life)**exponent)
        # SARSA update: move Q(s, a) toward the one-step TD target
        q[(state,
           action)] += alpha * (reward + γ * q[(next_state, next_action)] -
                                q[(state, action)])
        new_pol: Mapping[S, Optional[Categorical[A]]] = Pi.policy_map
        if actions[state] is None:
            new_pol[state] = None
        policy_map = {
            action: eps / len(actions[state])
            for action in actions[state]
        }
        best_action = actions[state][0]
        for action in actions[state]:
            if q[(state, best_action)] <= q[(state, action)]:
                best_action = action
        policy_map[best_action] += 1 - eps
        new_pol[state] = Categorical(policy_map)
        Pi = FinitePolicy(new_pol)
        state = next_state
        if next_state is None:
            state = Categorical({state: 1 for state in states}).sample()
    return q
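The policy-improvement step above rebuilds an epsilon-greedy distribution over actions from the current Q-values. A standalone sketch of that construction for a single state (the state label, action names, and Q-table are invented):

eps = 0.1
actions = ['left', 'right', 'stay']
q = {('s0', 'left'): 1.2, ('s0', 'right'): 2.5, ('s0', 'stay'): 0.3}

# Every action gets eps / |A(s)| probability...
policy_probs = {a: eps / len(actions) for a in actions}
# ...and the greedy action receives the remaining 1 - eps on top.
best_action = max(actions, key=lambda a: q[('s0', a)])
policy_probs[best_action] += 1 - eps

print(policy_probs)
# {'left': 0.0333..., 'right': 0.9333..., 'stay': 0.0333...}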
Example #10
    def test_unwrap_finite_horizon_MDP(self):
        finite = finite_horizon_MDP(self.finite_flip_flop, 10)
        unwrapped = unwrap_finite_horizon_MDP(finite)

        self.assertEqual(len(unwrapped), 10)

        def action_mapping_for(s: WithTime[bool]) -> \
                ActionMapping[bool, WithTime[bool]]:
            same = NonTerminal(s.step_time())
            different = NonTerminal(dataclasses.replace(
                s.step_time(),
                state=not s.state
            ))

            return {
                True: Categorical({
                    (same, 1.0): 0.7,
                    (different, 2.0): 0.3
                }),
                False: Categorical({
                    (same, 1.0): 0.3,
                    (different, 2.0): 0.7
                })
            }

        for t in range(9):
            for s in True, False:
                s_time = WithTime(state=s, time=t)
                for a in True, False:
                    distribution.assert_almost_equal(
                        self,
                        finite.mapping[NonTerminal(s_time)][a],
                        action_mapping_for(s_time)[a]
                    )

        for s in True, False:
            s_time = WithTime(state=s, time=9)
            same = Terminal(s_time.step_time())
            different = Terminal(dataclasses.replace(
                s_time.step_time(),
                state=not s_time.state
            ))
            act_map = {
                True: Categorical({
                    (same, 1.0): 0.7,
                    (different, 2.0): 0.3
                }),
                False: Categorical({
                    (same, 1.0): 0.3,
                    (different, 2.0): 0.7
                })
            }
            for a in True, False:
                distribution.assert_almost_equal(
                    self,
                    finite.mapping[NonTerminal(s_time)][a],
                    act_map[a]
                )
Example #11
 def get_mapping(self) -> StateActionMapping[State, Action]:
     #We need to define the StateActionMapping for this Finite MDP
     mapping: StateActionMapping[State, Action] = {}
     list_actions: List[Action] = []
     #We start by defining all the available actions
     for i in range(self.H + 1):
         range_j = self.H - i
         for j in range(range_j + 1):
             list_actions.append(Action(i, j))
     self.list_actions: List[Action] = list_actions
     list_states: List[State] = []
     #Then we define all the possible states
     for i in range(1, self.W + 1):
         list_states.append(State(i))
     self.list_states: List[State] = list_states
     for state in list_states:
         submapping: ActionMapping[Action, StateReward[State]] = {}
         for action in list_actions:
             s: int = action.s
             l: int = action.l
             reward: float = state.wage * (self.H - l - s)
             pois_mean: float = self.alpha * l
             proba_offer: float = self.beta * s / self.H
             if state.wage == self.W:
                  # If you're in state W, you stay in state W with
                  # probability 1. The reward only depends on the action
                  # you have chosen
                 submapping[action] = Constant((state, reward))
             elif state.wage == self.W - 1:
                 #If you're in state W-1, you can either stay in your state
                 #or land in state W
                  submapping[action] = Categorical({
                      (state, reward):
                          poisson.pmf(0, pois_mean) * (1 - proba_offer),
                      (State(self.W), reward):
                          proba_offer + (1 - proba_offer) *
                          (1 - poisson.pmf(0, pois_mean))
                  })
             else:
                 #If you're in any other state, you can land to any state
                 #Between your current state and W with probabilities
                 #as described before
                 dic_distrib = {}
                  dic_distrib[(state, reward)] = \
                      poisson.pmf(0, pois_mean) * (1 - proba_offer)
                  dic_distrib[(State(state.wage + 1), reward)] = \
                      proba_offer * poisson.cdf(1, pois_mean) + \
                      (1 - proba_offer) * poisson.pmf(1, pois_mean)
                 for k in range(2, self.W - state.wage):
                     dic_distrib[(State(state.wage + k),
                                  reward)] = poisson.pmf(k, pois_mean)
                 dic_distrib[(State(self.W), reward)] = 1 - poisson.cdf(
                     self.W - state.wage - 1, pois_mean)
                 submapping[action] = Categorical(dic_distrib)
         mapping[state] = submapping
     return mapping
Example #12
def mc_control_scratch(
        #traces: Iterable[Iterable[mp.TransitionStep[S]]],
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    start_state_distrib = Categorical({state: 1 for state in states})
    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)
        #print(episode)
        for step in episode:
            state = step.state
            action = step.action
            return_ = step.return_
            counts_per_state_act[(state, action)] += 1
            q[(state, action)] += 1 / counts_per_state_act[
                (state, action)] * (return_ - q[(state, action)])
        eps = 1 / (i + 1)
        new_pol: Mapping[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
                continue
            policy_map = {
                action: eps / len(actions[state])
                for action in actions[state]
            }
            best_action = actions[state][0]
            for action in actions[state]:
                if q[(state, best_action)] <= q[(state, action)]:
                    best_action = action
            policy_map[best_action] += 1 - eps
            new_pol[state] = Categorical(policy_map)
        Pi = FinitePolicy(new_pol)

    return q
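The update q += (return_ - q) / count used above is the incremental form of a sample average, so each Q-value tracks the mean of the returns observed for its state-action pair. A quick standalone check with invented returns:

returns_seen = [4.0, 6.0, 5.0, 9.0]

q, n = 0.0, 0
for g in returns_seen:
    n += 1
    q += (g - q) / n              # incremental mean after n observations

print(q, sum(returns_seen) / len(returns_seen))   # both print 6.0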
Example #13
 def transition_for(_):
     return {
         True: Categorical({
             (NonTerminal(True), 1.0): 0.3,
             (NonTerminal(False), 2.0): 0.7
         }),
         False: Categorical({
             (NonTerminal(True), 2.0): 0.7,
             (NonTerminal(False), 1.0): 0.3
         })
     }
Example #14
 def transition_for(time):
     return {
         True: Categorical({
             (True, 1.0): 0.3,
             (False, 2.0): 0.7,
         }),
         False: Categorical({
             (True, 2.0): 0.7,
             (False, 1.0): 0.3,
         })
     }
Example #15
 def setUp(self):
     self.finite_flip_flop = FiniteMarkovDecisionProcess({
         True: {
             True: Categorical({(True, 1.0): 0.7, (False, 2.0): 0.3}),
             False: Categorical({(True, 1.0): 0.3, (False, 2.0): 0.7}),
         },
         False: {
             True: Categorical({(False, 1.0): 0.7, (True, 2.0): 0.3}),
             False: Categorical({(False, 1.0): 0.3, (True, 2.0): 0.7}),
         }
     })
Example #16
        def action_mapping_for(
                s: WithTime[bool]) -> ActionMapping[bool, WithTime[bool]]:
            same = s.step_time()
            different = dataclasses.replace(s.step_time(), state=not s.state)

            return {
                True: Categorical({
                    (same, 1.0): 0.7,
                    (different, 2.0): 0.3
                }),
                False: Categorical({
                    (same, 1.0): 0.3,
                    (different, 2.0): 0.7
                })
            }
Example #17
 def get_opt_vf_and_policy(self) -> \
         Iterator[Tuple[V[int], FinitePolicy[int, bool]]]:
     dt: float = self.dt()
     up_factor: float = np.exp(self.vol * np.sqrt(dt))
     up_prob: float = (np.exp(self.rate * dt) * up_factor - 1) / \
         (up_factor * up_factor - 1)
     return optimal_vf_and_policy(
         steps=[
             {j: None if j == -1 else {
                 True: Constant(
                     (
                         -1,
                         self.payoff(i * dt, self.state_price(i, j))
                     )
                 ),
                 False: Categorical(
                     {
                         (j + 1, 0.): up_prob,
                         (j, 0.): 1 - up_prob
                     }
                 )
             } for j in range(i + 1)}
             for i in range(self.num_steps + 1)
         ],
         gamma=np.exp(-self.rate * dt)
     )
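The up_prob above is the risk-neutral up-move probability of a binomial lattice with u = exp(vol * sqrt(dt)) and d = 1/u, i.e. q = (exp(r*dt) - d) / (u - d) = (exp(r*dt)*u - 1) / (u*u - 1). A hedged numeric check, with invented parameters, that the discounted one-step expectation recovers the current price:

import numpy as np

rate, vol, dt, price = 0.05, 0.25, 1 / 300, 100.0
u = np.exp(vol * np.sqrt(dt))
q = (np.exp(rate * dt) * u - 1) / (u * u - 1)

one_step = np.exp(-rate * dt) * (q * price * u + (1 - q) * price / u)
print(q, one_step)   # q lies in (0, 1) and one_step is approximately 100.0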
Example #18
class TestFiniteDistribution(unittest.TestCase):
    def setUp(self):
        self.die = Choose({1, 2, 3, 4, 5, 6})

        self.ragged = Categorical({0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025})

    def test_map(self):
        plusOne = self.die.map(lambda x: x + 1)
        assert_almost_equal(self, plusOne, Choose({2, 3, 4, 5, 6, 7}))

        evenOdd = self.die.map(lambda x: x % 2 == 0)
        assert_almost_equal(self, evenOdd, Choose({True, False}))

        greaterThan4 = self.die.map(lambda x: x > 4)
        assert_almost_equal(self, greaterThan4,
                            Categorical({
                                True: 1 / 3,
                                False: 2 / 3
                            }))

    def test_expectation(self):
        self.assertAlmostEqual(self.die.expectation(float), 3.5)

        even = self.die.map(lambda n: n % 2 == 0)
        self.assertAlmostEqual(even.expectation(float), 0.5)

        self.assertAlmostEqual(self.ragged.expectation(float), 0.175)
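The asserted expectation of 0.175 for the ragged distribution is just the probability-weighted sum of its outcomes; a standalone check with a plain dict in place of Categorical:

ragged = {0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025}
expectation = sum(outcome * prob for outcome, prob in ragged.items())
print(expectation)   # 0*0.9 + 1*0.05 + 2*0.025 + 3*0.025 = 0.175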
Example #19
    def test_constant(self):
        assert_almost_equal(self, self.fair,
                            Categorical({
                                True: 0.5,
                                False: 0.5
                            }))
        self.assertAlmostEqual(self.fair.probability(True), 0.5)
        self.assertAlmostEqual(self.fair.probability(False), 0.5)

        assert_almost_equal(self, self.unfair,
                            Categorical({
                                True: 0.3,
                                False: 0.7
                            }))
        self.assertAlmostEqual(self.unfair.probability(True), 0.3)
        self.assertAlmostEqual(self.unfair.probability(False), 0.7)
Example #20
    def transition(self, state: StateMP2) -> Categorical[StateMP2]:
        up_p = self.up_prob(state)

        return Categorical({
            StateMP2(state.price + 1, True): up_p,
            StateMP2(state.price - 1, False): 1 - up_p
        })
Example #21
    def transition(self, state: StateMP1) -> Categorical[StateMP1]:
        up_p = self.up_prob(state)

        return Categorical({
            StateMP1(state.price + 1): up_p,
            StateMP1(state.price - 1): 1 - up_p
        })
Example #22
    def get_action_transition_reward_map(self) -> FrogCroakMapping:
        """Get the mapping from states and actions to rewards and new states.
        """
        d: Dict[FrogState, Dict[Croak,
                                Categorical[Tuple[FrogState, float]]]] = {
                                    FrogState(self.river.n_lily): None,
                                    FrogState(0): None
                                }

        for position in range(1, self.river.n_lily):
            state: FrogState = FrogState(position)
            d1: Dict[Croak, Categorical[Tuple[FrogState, float]]] = {}

            for croak_A in (True, False):
                sr_probs_dict: Dict[Tuple[FrogState, float], float] =\
                    {
                        (FrogState(next_p), self.reward_at_position(next_p)):
                        self.get_transit_prob_from_croak(
                            state, FrogState(next_p), croak_A
                        ) for next_p in range(self.river.n_lily+1)
                    }
                d1[croak_A] = Categorical(sr_probs_dict)

            d[state] = d1
        return d
Example #23
    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[InventoryState, Dict[int, Categorical[Tuple[InventoryState,
                                                            float]]]] = {}

        for alpha in range(self.capacity + 1):
            for beta in range(self.capacity + 1 - alpha):
                state: InventoryState = InventoryState(alpha, beta)
                ip: int = state.inventory_position()
                base_reward: float = -self.holding_cost * alpha
                d1: Dict[int, Categorical[Tuple[InventoryState, float]]] = {}

                for order in range(self.capacity - ip + 1):
                    sr_probs_dict: Dict[Tuple[InventoryState, float], float] =\
                        {(InventoryState(ip - i, order), base_reward):
                         self.poisson_distr.pmf(i) for i in range(ip)}

                    probability: float = 1 - self.poisson_distr.cdf(ip - 1)
                    reward: float = base_reward - self.stockout_cost *\
                        (probability * (self.poisson_lambda - ip) +
                         ip * self.poisson_distr.pmf(ip))
                    sr_probs_dict[(InventoryState(0, order), reward)] = \
                        probability
                    d1[order] = Categorical(sr_probs_dict)

                d[state] = d1
        return d
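For each order quantity, the loop above places Poisson mass pmf(i) on demands 0 through ip - 1 and lumps the tail mass 1 - cdf(ip - 1) onto the stock-out outcome, so every Categorical it builds is a proper distribution. A hedged sanity check using scipy's Poisson with an invented demand rate and inventory position:

from scipy.stats import poisson

poisson_lambda, ip = 2.0, 3
poisson_distr = poisson(poisson_lambda)

masses = [poisson_distr.pmf(i) for i in range(ip)]
tail = 1 - poisson_distr.cdf(ip - 1)
print(sum(masses) + tail)   # 1.0 up to floating-point error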
Example #24
 def get_transition_reward_map(
         self) -> RewardTransition[StateSnakeAndLadder]:
     d: Dict[StateSnakeAndLadder, Categorical[Tuple[StateSnakeAndLadder,
                                                    float]]] = {}
     dic_mapping = {}
     for i in self.grid:
         dic_mapping[i.start] = i.end
     for i in range(1, self.grid_size):
         state = StateSnakeAndLadder(position=i)
         dic_positions_associated: dict = {}
         for j in range(i + 1, i + 7):
             if (dic_mapping.get(j)) is not None:
                 new_pos = dic_mapping[j]
             else:
                 new_pos = j
             if new_pos > self.grid_size:
                 new_pos = self.grid_size
             if new_pos in dic_positions_associated.keys():
                 dic_positions_associated[new_pos] += 1
             else:
                 dic_positions_associated[new_pos] = 1
         sr_probs_map: Dict[Tuple[StateSnakeAndLadder, float], float] = {
             (StateSnakeAndLadder(position=j), 1.):
                 dic_positions_associated[j] / 6
             for j in dic_positions_associated
         }
         d[state] = Categorical(sr_probs_map)
     d[StateSnakeAndLadder(position=self.grid_size)] = None
     return d
Example #25
    def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]:
        ep_rewards: ndarray = empty(self.time_steps)
        ep_actions: ndarray = empty(self.time_steps, dtype=int)
        scores: List[float] = [0.] * self.num_arms
        avg_reward: float = 0.

        for i in range(self.time_steps):
            max_score: float = max(scores)
            exp_scores: Sequence[float] = [exp(s - max_score) for s in scores]
            sum_exp_scores = sum(exp_scores)
            probs: Sequence[float] = [s / sum_exp_scores for s in exp_scores]
            action: int = Categorical(
                {i: p for i, p in enumerate(probs)}
            ).sample()
            reward: float = self.arm_distributions[action].sample()
            avg_reward += (reward - avg_reward) / (i + 1)
            step_size: float = self.learning_rate *\
                (i / self.learning_rate_decay + 1) ** -0.5
            for j in range(self.num_arms):
                scores[j] += step_size * (reward - avg_reward) *\
                             ((1 if j == action else 0) - probs[j])

            ep_rewards[i] = reward
            ep_actions[i] = action
        return ep_rewards, ep_actions
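The action selection above samples from a softmax over the current scores, shifted by the maximum score for numerical stability. A standalone sketch with invented scores, using random.choices in place of Categorical:

import random
from math import exp

scores = [0.4, 1.1, -0.3]
max_score = max(scores)
exp_scores = [exp(s - max_score) for s in scores]   # shift keeps exp() well-behaved
total = sum(exp_scores)
probs = [e / total for e in exp_scores]

action = random.choices(range(len(scores)), weights=probs, k=1)[0]
print(probs, action)   # probs sum to 1; action is a sampled arm index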
Example #26
    def test_finite_horizon_MRP(self):
        finite = finite_horizon_MRP(self.finite_flip_flop, 10)

        trues = [NonTerminal(WithTime(True, time)) for time in range(10)]
        falses = [NonTerminal(WithTime(False, time)) for time in range(10)]
        non_terminal_states = set(trues + falses)
        self.assertEqual(set(finite.non_terminal_states), non_terminal_states)

        expected_transition = {}
        for state in non_terminal_states:
            t: int = state.state.time
            st: bool = state.state.state
            if t < 9:
                prob = {
                    (NonTerminal(WithTime(st, t + 1)), 1.0): 0.3,
                    (NonTerminal(WithTime(not st, t + 1)), 2.0): 0.7
                }
            else:
                prob = {
                    (Terminal(WithTime(st, t + 1)), 1.0): 0.3,
                    (Terminal(WithTime(not st, t + 1)), 2.0): 0.7
                }

            expected_transition[state] = Categorical(prob)

        for state in non_terminal_states:
            distribution.assert_almost_equal(
                self,
                finite.transition_reward(state),
                expected_transition[state])
Example #27
    def get_action_transition_reward_map(
            self) -> StateActionMapping[int, DaySplit]:

        d: Dict[int, Dict[DaySplit, Categorical[Tuple[int, int]]]] = {}

        for w in range(1, self.W + 1):
            actionMap: Dict[DaySplit, Categorical[Tuple[int, int]]] = {}
            for l in range(self.H + 1):
                for s in range(self.H + 1 - l):
                    action = DaySplit(l, s, self.H - l - s)
                    probDict = {}
                    probDict[(w, action.w *
                              w)] = poisson(self.alpha * action.l, 0) * (
                                  1 - self.beta * action.s / self.H)
                    probDict[(min(
                        self.W, w + 1), action.w * w)] = probDict.get(
                            (min(self.W, w + 1), action.w * w), 0) + poisson(
                                self.alpha * action.l,
                                1) + poisson(self.alpha * action.l, 0) * (
                                    self.beta * action.s / self.H)
                    for k in range(2, self.W - w):
                        probDict[(w + k, action.w * w)] = poisson(
                            self.alpha * action.l, k)
                    probDict[(self.W, action.w * w)] = probDict.get(
                        (self.W, action.w * w), 0) + 1 - sum(probDict.values())
                    actionMap[action] = Categorical(probDict)
            d[w] = actionMap
        return d
Example #28
    def test_finite_horizon_MRP(self):
        finite = finite_horizon_MRP(self.finite_flip_flop, 10)

        trues = [WithTime(True, time) for time in range(0, 10)]
        falses = [WithTime(False, time) for time in range(0, 10)]
        non_terminal_states = set(trues + falses)
        terminal_states = {WithTime(True, 10), WithTime(False, 10)}
        expected_states = non_terminal_states.union(terminal_states)

        self.assertEqual(set(finite.states()), expected_states)

        expected_transition = {}
        for state in non_terminal_states:
            expected_transition[state] =\
                Categorical({
                    (WithTime(state.state, state.time + 1), 1.0): 0.3,
                    (WithTime(not state.state, state.time + 1), 2.0): 0.7
                })

        for state in non_terminal_states:
            distribution.assert_almost_equal(self,
                                             finite.transition_reward(state),
                                             expected_transition[state])

        for state in terminal_states:
            self.assertEqual(finite.transition(state), None)
Example #29
    def get_transition_map(self) -> mp.Transition[mp.S]:
        """
        Get the transition map associated with the playing board.

        The transition map associated with each move is dictated by the uniform
        probability of rolling each value on the die, the landing position
        associated with each possible roll, the presence of a snake or ladder
        at the landing position, and proximity to the end of the board.

        A `GameState` object is instantiated for each tile position on the
        board. That object is mapped to N subsequent positions with uniform
        probability, where N is the number of sides on the playing die.
        Corrections are made to the position attribute of each `GameState` 
        object if the position lies beyond the end of the board or corresponds
        to snakes or ladders.

        :returns: Transition map representing "Snakes and Ladders" board
        """
        d: Dict[GameState, Optional[Categorical[GameState]]] = {
            GameState(self.num_tiles) : None
        }
        for position in range(self.num_tiles):
            game_state = GameState(position)
            game_state_prob_map: Mapping[GameState, float] = {
                GameState(i).check_SL(
                    self.snakes_and_ladders
                ).check_board_end(
                    self.num_tiles, i-position
                ) : self.uniform_distr for i in range(
                    position+1,
                    position+self.dice_sides+1
                )
            }
            d[game_state] = Categorical(game_state_prob_map)
        return d
Example #30
    def get_transition_reward_map(self) -> RewardTransition[SnakesAndLaddersState]:
        d: Dict[SnakesAndLaddersState,
                Categorical[Tuple[SnakesAndLaddersState, float]]] = {}
        for pos in range(1,100+1):
            state = SnakesAndLaddersState(pos)
            sr_probs_map: Dict[Tuple[SnakesAndLaddersState, float], float] = {}
            for j in range(1, 6+1):
                next_state = SnakesAndLaddersState(snake_or_ladder(pos + j))
                probs = end_game_probs(pos, snake_or_ladder(pos + j))
                reward = 1.0 if pos != 100 else 0.0
                sr_probs_map[(next_state, reward)] = probs
            d[state] = Categorical(sr_probs_map)
        return d