Example #1
 def set_transition(self, s_hash, a_desc, 
                    snext_hash, reward_obj=Reward(const=0.0), 
                    action_prob=1.0, trans_prob=1.0):
     """
     Set or add prob and Reward entries to sn_probD and sn_rewardD.
     
     action_prob controls the probability of picking an action from a list of actions.
     i.e. if in state s, there can be a list of (a1,p1), (a2,p2), (a3,p3), etc.
     
     trans_prob controls the probability of picking next state from a list of next states.
     i.e. if taking action a in state s, there can be a list of (sn1,p1), (sn2,p2), (sn3,p3), etc.
 
     Rewards can vary in a stochastic environment.
     Reward objects can give constant, weighted tabular, or function-based float reward values.
     
     The Reward object is always associated with (s,a,sn); however, the numerical value can vary
     with probability distributions of its own.
     
     DELAY NORMALIZING... Allows sequential adding of multiple action/prob pairs.
        i.e. Merely set the flag "is_normalized" to False
             to trigger a later "normalize" call.
     """
     self.sa_coll.set_action_prob( s_hash, a_desc, prob=action_prob)
     
     T = self.get_transition_obj( s_hash, a_desc )
     Sn = self.state_coll.get_state_obj( snext_hash )
     
     T.set_transition( Sn, reward_obj=reward_obj, prob=trans_prob)
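The two probability levels multiply: action_prob weights the choice of action in state s, and trans_prob weights the choice of next state given that action. A minimal sketch of that arithmetic with hypothetical values (plain dicts, not the library's classes):

action_probD = {'a1': 0.7, 'a2': 0.3}                    # (a, p) pairs for state s
trans_probD = {('a1', 'sn1'): 0.9, ('a1', 'sn2'): 0.1,   # (sn, p) pairs per action
               ('a2', 'sn1'): 1.0}

# overall chance of landing in sn1 from s
p_sn1 = sum(action_probD[a] * tp
            for (a, sn), tp in trans_probD.items() if sn == 'sn1')
print(p_sn1)  # 0.7*0.9 + 0.3*1.0 = 0.93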
Example #2
 def add_to_environment(self, env):
     """Populate an environment object with the collected data about s_hash"""
     if not self.is_consistent_info():
         print( 'WARNING... NOT CONSISTENT. '*3 )
         
     if self.total_action_calls == 0:
         print('WARNING... No Available ModelStateData to send to Environment')
     else:
     
         for (a_desc, a_count) in self.action_countD.items():
             if a_count > 0:
                 # fraction of calls in s_hash using a_desc
                 a_prob = float(a_count) / float(self.total_action_calls)
                 
                 if a_desc in self.action_sn_rD:
                     snD = self.action_sn_rD[ a_desc ] # snD...  index=sn_hash: value=rwd_ave_obj
                     for sn_hash, rwd_ave_obj in snD.items():
             
                         # fraction of times using a_desc in s_hash resulted in sn_hash
                         t_prob = float(rwd_ave_obj.num_val) / float(a_count)
                         
                         env.TC.set_transition( self.s_hash, a_desc, 
                                                sn_hash, reward_obj=Reward(const=rwd_ave_obj.get_ave()), 
                                                action_prob=a_prob, trans_prob=t_prob)
         
         # make sure all normalize flags are set in env.TC
         for (s_hash, a_desc, T) in env.TC.iter_all_transitions():
             T.normalize()
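The two fractions computed above are simple count ratios. With hypothetical counts: if state s saw 10 action calls, 8 of them used 'R', and 6 of those landed in (0, 1), then:

total_action_calls = 10
a_count = 8           # times 'R' was chosen in s
sn_count = 6          # times 'R' led to (0, 1); rwd_ave_obj.num_val above

a_prob = float(a_count) / float(total_action_calls)  # 0.8
t_prob = float(sn_count) / float(a_count)            # 0.75
print(a_prob, t_prob)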
Example #3
 def add_event( s_hash, a_desc, sn_hash ):
     
     r = rewardD.get( sn_hash, 0.0)
         
     TC.set_transition( s_hash, a_desc,
                        sn_hash, reward_obj=Reward(const=r), 
                        action_prob=1.0, trans_prob=1.0)
Example #4
    def add_transition(self, a_desc, snext_hash, t_prob, reward_obj):
        """Add the (sn,tp,R) triplet for the (s,a)"""

        # catch non-float t_prob
        if is_float(t_prob):
            t_prob = floatCast(t_prob)  # make sure it's a simple float
        else:
            raise ValueError('transition prob: "%s" MUST BE A FLOAT.' %
                             str(t_prob))
        # -----------

        # allow float inputs for reward... recast as Reward object
        if is_float(reward_obj):
            reward_obj = Reward(const=reward_obj)

        if not isinstance(reward_obj, Reward):
            raise ValueError(
                'reward_obj: "%s" MUST BE A Reward object OR float.' %
                str(reward_obj))
        # -----------

        if a_desc not in self.action_snprD:
            # snD...  index=snext_hash: value=(t_prob, reward_obj)
            self.action_snprD[a_desc] = {}

        self.action_snprD[a_desc][snext_hash] = (t_prob, reward_obj)
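is_float and floatCast are library helpers not shown in this excerpt; a minimal stand-in with the behavior the checks above rely on (True for anything that casts cleanly to float) might look like:

def is_float(val):
    """Stand-in helper: True if val casts cleanly to float."""
    try:
        float(val)
        return True
    except (TypeError, ValueError):
        return False

print(is_float(1), is_float('2.5'), is_float('abc'))  # True True False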
Example #5
    def setUp(self):
        unittest.TestCase.setUp(self)
        s = State((2, 2))
        a = Action('U')
        self.T = Transition(s, a)

        rc = Reward(const=1.1)
        reward_probL = [(0.0, 1), (1.0, 1), (2.0, 2)]
        rt = Reward(reward_probL=reward_probL)

        def my_gauss():
            return random.gauss(3.0, 0.5)

        rf = Reward(reward_dist_func=my_gauss)

        self.T.set_transition(State((2, 3)), reward_obj=rc, prob=0.8)
        self.T.set_transition(State((1, 2)), reward_obj=rt, prob=0.1)
        self.T.set_transition(State((3, 2)), reward_obj=rf, prob=0.1)
        self.T.set_transition(State((0, 0)), reward_obj=rf, prob=0.0)
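The three Reward flavors exercised here are constant, weighted tabular, and distribution-function values. A generic illustration of what each one samples (plain Python, not the introrl API):

import random

def sample_const():
    return 1.1                                     # like Reward(const=1.1)

def sample_tabular():
    vals, wts = zip(*[(0.0, 1), (1.0, 1), (2.0, 2)])
    return random.choices(vals, weights=wts)[0]    # like Reward(reward_probL=...); 2.0 twice as likely

def sample_dist():
    return random.gauss(3.0, 0.5)                  # like Reward(reward_dist_func=my_gauss)

print(sample_const(), sample_tabular(), sample_dist())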
Example #6
    def add_transition(self,
                       s_hash,
                       a_desc,
                       snext_hash,
                       t_prob=1.0,
                       reward_obj=Reward(const=0.0)):
        if s_hash not in self.define_statesD:
            self.define_statesD[s_hash] = DefineStateMoves(s_hash)

        self.define_statesD[s_hash].add_transition(a_desc, snext_hash, t_prob,
                                                   reward_obj)
Example #7
    def test_get_random_transition(self):
        """test get random transition"""

        # add another transition
        self.TC.set_transition((0, 0),
                               'R', (0, 3),
                               reward_obj=Reward(const=1.0),
                               action_prob=1.0,
                               trans_prob=1.0)
        snL = []
        for i in range(30):
            Sn = self.TC.get_prob_weighted_next_state((0, 0), 'R')
            snL.append(Sn.hash)

        self.assertGreater(snL.count((0, 1)), 5)  # should be 15
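The threshold of 5 leaves a wide safety margin: with two equally likely next states, snL.count((0, 1)) is Binomial(30, 0.5) with mean 15, and a count of 5 or fewer is vanishingly rare:

from math import comb

# P(X <= 5) for X ~ Binomial(30, 0.5): the chance this assertion
# fails by bad luck alone.
p = sum(comb(30, k) for k in range(6)) / 2**30
print(p)  # ~1.6e-4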
Example #8
    def test_set_transition_prob(self):
        """test set transition prob"""
        # create a second possibility for (0,0), 'R'
        self.TC.set_transition((0, 0),
                               'R', (0, 3),
                               reward_obj=Reward(const=1.0),
                               action_prob=1.0,
                               trans_prob=1.0)

        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        self.assertEqual(p1, 0.5)
        self.assertEqual(p1, p2)
        self.assertNotEqual(Sn1, Sn2)

        # set explicitly
        self.TC.set_transition_prob((0, 0), 'R', (0, 1), prob=0.1)
        self.TC.set_transition_prob((0, 0), 'R', (0, 3), prob=0.9)

        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        self.assertEqual(p1, 0.1)
        self.assertEqual(p2, 0.9)

        # try setting sole prob
        self.TC.set_sole_transition((0, 0), 'R', (0, 1))
        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        self.assertEqual(p1, 1.0)
        self.assertEqual(p2, 0.0)

        # try sole random
        self.TC.initialize_sole_random((0, 0), 'R')
        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        pL = sorted([p1, p2])
        self.assertEqual(pL, [0.0, 1.0])

        # try equiprobable
        self.TC.intialize_to_equiprobable((0, 0), 'R')
        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        self.assertEqual(p1, 0.5)
        self.assertEqual(p1, p2)
        self.assertNotEqual(Sn1, Sn2)
Example #9
 def set_transition(self,
                    next_state_obj,
                    reward_obj=Reward(const=0.0),
                    prob=1.0):
     """
     Set or add prob and Reward entries to sn_probD and sn_rewardD.
     
     DELAY NORMALIZING... Allows sequential adding of multiple action/prob pairs.
        i.e. Merely set the flag "is_normalized" to False
             to trigger a later "normalize" call.
     """
     prob = float(prob)
     self.sn_probD[next_state_obj] = prob  # index=next_state_obj: value=transition probability
     self.sn_hashD[next_state_obj.hash] = next_state_obj
     self.sn_rewardD[next_state_obj] = reward_obj
     self.is_normalized = False
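The later normalize() pass just rescales the accumulated probabilities to sum to 1.0; conceptually (a sketch of the idea, not the library's method):

def normalize(sn_probD):
    """Rescale probs in place so they sum to 1.0."""
    total = sum(sn_probD.values())
    if total > 0.0:
        for sn in sn_probD:
            sn_probD[sn] /= total

d = {'sn1': 2.0, 'sn2': 1.0, 'sn3': 1.0}
normalize(d)
print(d)  # {'sn1': 0.5, 'sn2': 0.25, 'sn3': 0.25}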
Example #10
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.TC = TransitionColl()

        actionD = {
            (0, 0): ('D', 'R'),
            (0, 1): ('L', 'R'),
            (0, 2): ('L', 'D', 'R'),
            (1, 0): ('U', 'D'),
            (1, 2): ('U', 'D', 'R'),
            (2, 0): ('U', 'R'),
            (2, 1): ('L', 'R'),
            (2, 2): ('L', 'R', 'U'),
            (2, 3): ('L', 'U')
        }

        rewardD = {(0, 3): 1, (1, 3): -1}

        for state_hash, actionL in actionD.items():
            for action_desc in actionL:

                a = action_desc
                s = state_hash

                if a == 'U':
                    snext_hash = (s[0] - 1, s[1])
                elif a == 'D':
                    snext_hash = (s[0] + 1, s[1])
                elif a == 'R':
                    snext_hash = (s[0], s[1] + 1)
                elif a == 'L':
                    snext_hash = (s[0], s[1] - 1)

                reward_val = rewardD.get(snext_hash, 0.0)

                self.TC.set_transition(s,
                                       a,
                                       snext_hash,
                                       reward_obj=Reward(const=reward_val),
                                       action_prob=1.0,
                                       trans_prob=1.0)
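actionD and rewardD encode the classic 3x4 gridworld: terminal rewards at (0, 3)=+1 and (1, 3)=-1, and no entry for the blocked cell (1, 1). The if/elif move arithmetic above can be expressed more compactly as a lookup (an equivalent sketch):

def next_hash(s, a):
    dr, dc = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}[a]
    return (s[0] + dr, s[1] + dc)

print(next_hash((2, 0), 'U'))  # (1, 0)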
Example #11
    def test_get_list_of_next_state(self):
        """test get list of next state prob"""

        snL = self.TC.get_list_of_all_next_state((0, 0),
                                                 'R',
                                                 incl_zero_prob=False)
        self.assertEqual(len(snL), 1)

        snL = self.TC.get_list_of_all_next_state((0, 0),
                                                 'R',
                                                 incl_zero_prob=True)
        self.assertEqual(len(snL), 1)

        # add another transition
        self.TC.set_transition((0, 0),
                               'R', (0, 3),
                               reward_obj=Reward(const=1.0),
                               action_prob=1.0,
                               trans_prob=1.0)

        snL = self.TC.get_list_of_all_next_state((0, 0),
                                                 'R',
                                                 incl_zero_prob=False)
        self.assertEqual(len(snL), 2)

        snL = self.TC.get_list_of_all_next_state((0, 0),
                                                 'R',
                                                 incl_zero_prob=True)
        self.assertEqual(len(snL), 2)

        # make one transition prob zero
        self.TC.initialize_sole_random((0, 0), 'R')
        snL = self.TC.get_list_of_all_next_state((0, 0),
                                                 'R',
                                                 incl_zero_prob=False)
        self.assertEqual(len(snL), 1)

        snL = self.TC.get_list_of_all_next_state((0, 0),
                                                 'R',
                                                 incl_zero_prob=True)
        self.assertEqual(len(snL), 2)
Example #12
    def test_remove_next_state(self):
        """test remove next state"""
        self.TC.set_transition((0, 0),
                               'R', (0, 3),
                               reward_obj=Reward(const=1.0),
                               action_prob=1.0,
                               trans_prob=1.0)

        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        self.assertEqual(p1, 0.5)
        self.assertEqual(p1, p2)
        self.assertNotEqual(Sn1, Sn2)

        # now remove original transition
        self.TC.remove_next_state((0, 0), 'R', (0, 1))
        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        self.assertEqual(p1, None)
        self.assertNotEqual(p1, p2)
        self.assertNotEqual(Sn1, Sn2)
        self.assertEqual(p2, 1.0)
Example #13
    def test_iter_transitions(self):
        """test iter transitions"""

        # add another transition
        self.TC.set_transition((0, 0),
                               'R', (0, 3),
                               reward_obj=Reward(const=1.0),
                               action_prob=1.0,
                               trans_prob=1.0)

        spL = []
        for (Sn, p) in self.TC.iter_next_state_prob((0, 0),
                                                    'R',
                                                    incl_zero_prob=False):
            spL.append(p)

        self.assertEqual(spL, [0.5, 0.5])

        # make one transition prob zero
        self.TC.initialize_sole_random((0, 0), 'R')

        spL = []
        for (Sn, p) in self.TC.iter_next_state_prob((0, 0),
                                                    'R',
                                                    incl_zero_prob=False):
            spL.append(p)

        self.assertEqual(spL, [1.0])

        spL = []
        for (Sn, p) in self.TC.iter_next_state_prob((0, 0),
                                                    'R',
                                                    incl_zero_prob=True):
            spL.append(p)

        self.assertEqual(sorted(spL), [0.0, 1.0])
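The incl_zero_prob flag simply decides whether zero-probability next states are filtered out. A generic sketch of that filter over a plain dict (hypothetical data, not the library's internals):

sn_probD = {'sn1': 1.0, 'sn2': 0.0}

def iter_next_state_prob(d, incl_zero_prob=False):
    for sn, p in d.items():
        if incl_zero_prob or p > 0.0:
            yield (sn, p)

print(list(iter_next_state_prob(sn_probD)))        # [('sn1', 1.0)]
print(list(iter_next_state_prob(sn_probD, True)))  # both entries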
Example #14
            # print next states sorted by descending probability
            for (prob, Sn) in sorted([(prob, Sn)
                                      for (Sn, prob) in self.sn_probD.items()],
                                     reverse=True):
                R = self.sn_rewardD[Sn]
                print('      %9s' % str(Sn.hash), '%6g' % prob,
                      '     %s' % str(R)[1:-1])


if __name__ == "__main__":  # pragma: no cover
    import random
    from introrl.action import Action
    from introrl.state import State

    s = State((2, 2))
    a = Action('U')
    T = Transition(s, a)

    rc = Reward(const=1.1)
    reward_probL = [(0.0, 1), (1.0, 1), (2.0, 2)]
    rt = Reward(reward_probL=reward_probL)

    def my_gauss():
        return random.gauss(3.0, 0.5)

    rf = Reward(reward_dist_func=my_gauss)

    T.set_transition(State((2, 3)), reward_obj=rc, prob=0.8)
    T.set_transition(State((1, 2)), reward_obj=rt, prob=0.1)
    T.set_transition(State((3, 2)), reward_obj=rf, prob=0.1)
    T.set_transition(State((0, 0)), reward_obj=rc, prob=0.0)
    T.summ_print()
Example #15
                self.add_action(state_hash, action_desc,
                                a_prob=1.0)  # a_prob will be normalized
                sn = state_hash + action_desc
                self.add_transition(state_hash,
                                    action_desc,
                                    sn,
                                    t_prob=1.0,
                                    reward_obj=0.0)

        self.define_env_states_actions()  # send all states and actions to environment
        self.start_state_hash = 12
        self.layout = GenericLayout(self)


reward_probL = [(0.0, 1), (1.0, 1)]  # will be normalized in use.
rt = Reward(reward_probL=reward_probL)


class TinyEnv(EnvBaseline):
    def __init__(self, name='Tiny Env'):
        EnvBaseline.__init__(self, name=name)

    def define_environment(self):
        for state_hash in range(1, 3):
            for action_desc in [-1, 1]:
                self.add_action(state_hash, action_desc,
                                a_prob=1.0)  # a_prob will be normalized
                sn = state_hash + action_desc
                self.add_transition(state_hash,
                                    action_desc,
                                    sn,
                                    t_prob=1.0,
                                    reward_obj=rt)
Example #16
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.Rc = Reward( const=1.1 )
     self.Rt = Reward( reward_probL=reward_probL )
     self.Rf = Reward( reward_dist_func=my_gauss )
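This setUp refers to module-level reward_probL and my_gauss that the excerpt omits; judging from Examples #5 and #14, the definitions presumably look like:

import random

reward_probL = [(0.0, 1), (1.0, 1), (2.0, 2)]

def my_gauss():
    return random.gauss(3.0, 0.5)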