Example #1
def get_six_states():

    env = EnvBaseline(name='Simple Six State World', s_hash_rowL=s_hash_rowL)
    env.set_info('Simple Six State World')

    actionD = {
        'A': ('U', ),
        'B': ('ur', 'D'),
        '<C>': ('ur', 'dl'),
        'D': ('ur', 'ul')
    }

    rewardD = {'A': -1.0, 'E': 0.5, 'F': 1.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    add_event('A', 'U', 'B')
    #add_event( 'A', 'Te', 'E' )
    add_event('B', 'D', 'A')
    add_event('B', 'ur', '<C>')
    add_event('<C>', 'dl', 'B')
    add_event('<C>', 'ur', 'D')
    add_event('D', 'ur', 'F')
    add_event('D', 'ul', 'E')

    env.define_env_states_actions(
    )  # send all states and actions to environment

    env.start_state_hash = '<C>'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc

    policyD['B'] = 'ur'    # head toward terminal F (reward +1.0)
    policyD['<C>'] = 'ur'
    policyD['D'] = 'ur'

    env.default_policyD = policyD

    return env
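
A minimal usage sketch for the environment factory above. The dp_value_iteration import path is an assumption (only EnvBaseline's import path appears in these snippets); the call mirrors the dp_value_iteration usage shown in the later examples.

# assumed import path; only EnvBaseline's path is shown in these snippets
from introrl.dp_funcs.dp_value_iter import dp_value_iteration

env = get_six_states()
env.summ_print()  # print the states, actions and transitions just defined

policy, state_value = dp_value_iteration(env,
                                         do_summ_print=True,
                                         fmt_V='%.2f',
                                         fmt_R='%.2f',
                                         max_iter=1000,
                                         err_delta=0.0001,
                                         gamma=0.9)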
Example #2
    #get_sim.define_statesD[(20,0)].summ_print()

    #sys.exit() # <-------------------------------------
    #get_sim.collect_transition_data( num_det_calls=10, num_stoic_calls=100 )
    #print('Total recorded actions After:', "{:,}".format( get_sim.total_num_action_data_points() ) )

    #get_sim.save_to_pickle_file( fname )

    #get_sim.summ_print( long=False )
    print('got sim data')
    print('_' * 55)

    #print('CR.s_hash_rowL =', CR.s_hash_rowL)
    env = EnvBaseline(s_hash_rowL=CR.s_hash_rowL,
                      x_axis_label=CR.x_axis_label,
                      y_axis_label=CR.y_axis_label)

    get_sim.add_all_data_to_an_environment(env)

    #env.save_to_pickle_file('car_rental')
    #print('Saved env to *.env_pickle file')

    print('built environment')
    print('_' * 55)

    #env.summ_print()
    policy, state_value = dp_value_iteration(env,
                                             do_summ_print=True,
                                             fmt_V='%.1f',
                                             fmt_R='%.1f',
Example #3
#import introrl
#print(introrl.__file__)

from introrl.environments.env_baseline import EnvBaseline
grid_world = EnvBaseline(mdp_file='Simple_Grid_World')
grid_world.summ_print()

#car_rental = EnvBaseline( mdp_file='Jacks_Car_Rental_(var_rtn)' )
#car_rental = EnvBaseline( mdp_file='Jacks_Car_Rental_(const_rtn)' )
#car_rental.summ_print()
def get_robot(step_reward=-0.04):

    gridworld = EnvBaseline(name='Slippery Cleaning Robot',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
        Example taken from "Dissecting Reinforcement Learning-Part 1" 
        Dec 9, 2016   Massimiliano Patacchiola
        https://mpatacchiola.github.io/blog/2016/12/09/dissecting-reinforcement-learning.html
        """)

    def get_right_angle_list(a):

        if a == 'U':
            raL = ['L', 'R']
        elif a == 'D':
            raL = ['L', 'R']
        elif a == 'R':
            raL = ['U', 'D']
        elif a == 'L':
            raL = ['U', 'D']

        return raL

    def get_move_s_next(a, s):

        sn = s
        if a == 'U':
            sn = (s[0] + 1, s[1])
        elif a == 'D':
            sn = (s[0] - 1, s[1])
        elif a == 'R':
            sn = (s[0], s[1] + 1)
        elif a == 'L':
            sn = (s[0], s[1] - 1)

        if sn == (2, 2):  # can't move into block in the middle.
            sn = s

        # limit moves to inside the edges.
        sn_hash = (clamp(sn[0], 1, 3), clamp(sn[1], 1, 4))

        return sn_hash

    non_termL = [(3, 1), (3, 2), (3, 3), (2, 1), (2, 3), (1, 1), (1, 2),
                 (1, 3), (1, 4)]

    rewardD = {(3, 4): 1, (2, 4): -1}

    # put in 80% and both 10% moves to target
    for s_hash in non_termL:
        for a_desc in ['U', 'D', 'L', 'R']:  # normal move
            gridworld.add_action(s_hash, a_desc, a_prob=0.25)

            # 80%
            sn_hash = get_move_s_next(a_desc, s_hash)
            reward_val = rewardD.get(sn_hash, step_reward)

            gridworld.add_transition(s_hash,
                                     a_desc,
                                     sn_hash,
                                     t_prob=0.8,
                                     reward_obj=reward_val)

            # both 10%
            right_angL = get_right_angle_list(a_desc)
            for ar_desc in right_angL:
                sn_hash = get_move_s_next(ar_desc, s_hash)
                reward_val = rewardD.get(sn_hash, step_reward)

                gridworld.add_transition(s_hash,
                                         a_desc,
                                         sn_hash,
                                         t_prob=0.1,
                                         reward_obj=reward_val)
    gridworld.define_env_states_actions()

    # If there is a start state, define it here.
    gridworld.start_state_hash = (1, 1)

    # define default policy (if any)
    policyD = {}  # index=s_hash, value=a_desc

    policyD[(3, 1)] = 'R'
    policyD[(3, 3)] = 'R'
    policyD[(3, 2)] = 'R'

    policyD[(2, 1)] = 'U'
    policyD[(2, 3)] = 'U'

    policyD[(1, 1)] = 'U'
    policyD[(1, 2)] = 'L'
    policyD[(1, 3)] = 'L'
    policyD[(1, 4)] = 'L'

    gridworld.default_policyD = policyD

    return gridworld
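
get_robot() relies on a module-level clamp helper and an s_hash_rowL layout that are not part of this snippet. A minimal sketch of what they might look like, inferred from how they are used (the layout strings are assumptions):

def clamp(val, low, high):
    """Keep val inside the closed interval [low, high]."""
    return max(low, min(high, val))

# 3x4 layout with row 3 at the top; (2,2) is the blocked cell in the middle.
s_hash_rowL = [
    [(3, 1), (3, 2), (3, 3), (3, 4)],
    [(2, 1), '"Wall"', (2, 3), (2, 4)],
    [(1, 1), (1, 2), (1, 3), (1, 4)],
]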
Example #5
def get_env():

    env = EnvBaseline( name="Jacks Car Rental (const rtn)" ) # GenericLayout set below
    
    simplified_str = """Shangtong Zhang's simplified model in which the number of cars
returned during the day is constant rather than a random draw from a Poisson
distribution. This reduces calculation time and leaves the optimal
policy/value state matrix almost the same."""
    
    env.set_info( 'Example 4.2 from Sutton & Barto 2nd Edition page 81.\n' + simplified_str )

    
    # define all possible actions.
    saL = [] # a list of (s1, s2, a_desc)
    s_hash_rowL = [] # layout rows for making 2D output

    
    for s1 in range( MAX_CARS + 1 ): # 20 cars max
        rowL = [] # row of s_hash_rowL
        
        for s2 in range( MAX_CARS + 1 ): # 20 cars max
            s_hash = (s1, s2)
            rowL.append( s_hash )
            
            for a_desc in range(-5, 6): # -5 moves 5 cars from 2nd to 1st. +5 from 1st to 2nd.
                
                if a_desc < 0: # can only move cars if they are present
                    if (abs(a_desc) <= s2):
                        env.add_action( s_hash, a_desc, a_prob=1.0 )
                        saL.append( (s1, s2, a_desc) )
                else:
                    if (a_desc <= s1): # can only move cars if they are present
                        env.add_action( s_hash, a_desc, a_prob=1.0 )
                        saL.append( (s1, s2, a_desc) )
        
        # use insert to put (0,0) at lower left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output
    
    # ------------------------------
    # figure out transition probabilities and rewards
    for s1 in range( MAX_CARS + 1 ):
        for s2 in range( MAX_CARS + 1 ):
            for a_desc in range( -5, 6 ):
                get_prob_reward( s1, s2, a_desc)

    # ------------------------------                                                
        
    print('\nStarting to define car rental transitions')
    # with all the probability figured out, define all transitions
    for (s1, s2, a_desc, sn_hash), t_prob in total_probD.items():
        txr = sum_prob_x_rewardD[ (s1, s2, a_desc, sn_hash) ]
        rval = txr / t_prob
        env.add_transition( (s1,s2), a_desc, sn_hash, t_prob=t_prob, reward_obj=rval)
    
        #if s1==10 and s2==10:
        #    print('for (10,10) a_desc=',a_desc,' sn_hash=',sn_hash,'  t_prob=',t_prob,'  rval=',rval)
    
    print('Calling: env.define_env_states_actions')
    env.define_env_states_actions()  # send all states and actions to environment
    print('Environment Ready.')

    # If there is a start state, define it here.
    env.start_state_hash = (10,10)

    # define default policy (if any)
    env.default_policyD = {}


    # --------------------
    # define layout for output

    env.layout = GenericLayout( env, s_hash_rowL=s_hash_rowL, 
                                x_axis_label='#Cars at Second Location',
                                y_axis_label='#Cars at First Location')
    
    return env
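
get_env() assumes get_prob_reward(s1, s2, a_desc) has filled two module-level dictionaries: total_probD holds the total transition probability for each (s1, s2, a_desc, sn_hash) key and sum_prob_x_rewardD holds the probability-weighted reward sum, so txr / t_prob above is the expected reward for that transition. A sketch of the accumulation pattern (the Poisson rental/return model itself is omitted and the helper name is illustrative):

from collections import defaultdict

total_probD = defaultdict(float)         # (s1, s2, a_desc, sn_hash) -> total probability
sum_prob_x_rewardD = defaultdict(float)  # (s1, s2, a_desc, sn_hash) -> sum(prob * reward)

def accumulate_outcome(s1, s2, a_desc, sn_hash, prob, reward):
    """Fold one possible outcome into the accumulators so that
    sum_prob_x_rewardD[key] / total_probD[key] is the expected reward."""
    key = (s1, s2, a_desc, sn_hash)
    total_probD[key] += prob
    sum_prob_x_rewardD[key] += prob * reward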
Example #6
def get_gridworld(step_reward=-1, height=7, goal=(3,7),
            windT=(0,0,0,1,1,1,2,2,1,0)):
    """
    Windy Gridworld with (0,0) at lower left
    width is defined by length of windT tuple.
    """

    gridworld = EnvBaseline( name='Windy Kings Gridworld' ) # GenericLayout set below
    gridworld.set_info( """""" )

    width = len( windT )

    def get_action_snext( s_hash, action):
        """returns state_next_hash"""

        di = 0
        dj = 0

        if 'N' in action:
            di = 1
        elif 'S' in action:
            di = -1
            
        if 'E' in action:
            dj = 1
        elif 'W' in action:
            dj = -1

        (i,j) = s_hash
        wind_di = windT[ j ]

        i_next = i + di
        # constrain basic move to be inside the grid
        i_next = max(0, min(height-1, i_next))

        i_next += wind_di # add wind to constrained move.
        j_next = j + dj

        # constrain next position to be inside the grid
        i_next = max(0, min(height-1, i_next))
        j_next = max(0, min(width-1, j_next))

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            state_next_hash = 'Goal'
        return state_next_hash


    # define default policy
    gridworld.default_policyD = {} #index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i,j)
            if s_hash == goal:
                pass  # s_hash == 'Goal'
            else:
                gridworld.default_policyD[ s_hash ] = ('N','S','E','W', 'NE','SE','SW','NW')
                for a_desc in ['N','S','E','W', 'NE','SE','SW','NW']:
                    gridworld.add_action( s_hash, a_desc, a_prob=1.0 ) # a_prob will be normalized

                    sn_hash = get_action_snext( s_hash, a_desc )
                    # add each event to transitions object
                    gridworld.add_transition( s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=step_reward)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------

    s_hash_rowL = [] # layout rows for making 2D output
    for i in range(height): # rows are inserted at the front below, so (0,0) ends up at lower left
        rowL = []
        for j in range(width):
            s_hash = (i,j)
            if s_hash == goal:
                s_hash = 'Goal'

            rowL.append( s_hash )

        # use insert to put (0,0) at lower left, append for upper left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output

    gridworld.layout = GenericLayout( gridworld, s_hash_rowL=s_hash_rowL,
                                      col_tickL=windT,
                                      x_axis_label='Upward Wind Speed'  )


    gridworld.start_state_hash =  (3,0)

    return gridworld
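
A sketch of solving the windy gridworld and saving a policy diagram, mirroring the dp_value_iteration and save_diagram calls used in the later examples. The import path and save_name are illustrative assumptions, and passing the EnvBaseline directly to save_diagram is assumed to work like passing a simulation object.

from introrl.dp_funcs.dp_value_iter import dp_value_iteration  # assumed path

gridworld = get_gridworld()

policy, state_value = dp_value_iteration(gridworld,
                                         do_summ_print=True,
                                         fmt_V='%.1f',
                                         fmt_R='%.1f',
                                         max_iter=1000,
                                         err_delta=0.0001,
                                         gamma=0.9)

# save_name is an arbitrary illustrative choice.
policy.save_diagram(gridworld, inp_colorD=None, save_name='windy_kings_policy',
                    show_arrows=True, scale=0.5, h_over_w=0.8)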
Example #7
    #sys.exit() # <-------------------------------------
    get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=100)
    print('Total recorded actions After:',
          "{:,}".format(get_sim.total_num_action_data_points()))

    get_sim.save_to_pickle_file(fname)

    #get_sim.summ_print( long=False )
    print('got sim data')
    print('_' * 55)

    #print('BJ.s_hash_rowL =', BJ.s_hash_rowL)

    env = EnvBaseline(s_hash_rowL=BJ.s_hash_rowL,
                      row_tickL=BJ.row_tickL,
                      col_tickL=BJ.col_tickL,
                      x_axis_label=BJ.x_axis_label,
                      y_axis_label=BJ.y_axis_label)

    get_sim.add_all_data_to_an_environment(env)

    print('built environment')
    print('_' * 55)

    #env.summ_print()
    policy, state_value = dp_value_iteration(env,
                                             do_summ_print=True,
                                             fmt_V='%.2f',
                                             fmt_R='%.2f',
                                             max_iter=1000,
                                             err_delta=0.0001,
Example #8
def get_gridworld(step_reward=0.0):
    gridworld = EnvBaseline(
        name='Simple Grid World')  # GenericLayout set below
    gridworld.set_info('Simple Grid World Example.')

    actionD = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U')
    }

    rewardD = {(0, 3): 1, (1, 3): -1}

    for state_hash, actionL in actionD.items():

        for action_desc in actionL:
            gridworld.add_action(state_hash, action_desc,
                                 a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash

            if a == 'U':
                state_next_hash = (s[0] - 1, s[1])
            elif a == 'D':
                state_next_hash = (s[0] + 1, s[1])
            elif a == 'R':
                state_next_hash = (s[0], s[1] + 1)
            elif a == 'L':
                state_next_hash = (s[0], s[1] - 1)

            reward_val = rewardD.get(state_next_hash, step_reward)

            gridworld.add_transition(state_hash,
                                     action_desc,
                                     state_next_hash,
                                     t_prob=1.0,
                                     reward_obj=reward_val)

    gridworld.define_env_states_actions(
    )  # send all states and actions to environment

    gridworld.layout = GenericLayout(
        gridworld)  # uses default "get_layout_row_col_of_state"

    # If there is a start state, define it here.
    gridworld.start_state_hash = (2, 0)
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # define default policy (if any)
    # Policy Dictionary for: GridWorld

    policyD = {}  # index=state_hash, value=action_desc

    #                 Vpi shown for gamma=0.9
    policyD[(0, 0)] = 'R'  # Vpi=0.81
    policyD[(1, 0)] = 'U'  # Vpi=0.729
    policyD[(0, 1)] = 'R'  # Vpi=0.9
    policyD[(0, 2)] = 'R'  # Vpi=1.0
    policyD[(1, 2)] = 'U'  # Vpi=0.9
    policyD[(2, 0)] = 'U'  # Vpi=0.6561
    policyD[(2, 2)] = 'U'  # Vpi=0.81
    policyD[(2, 1)] = 'R'  # Vpi=0.729
    policyD[(2, 3)] = 'L'  # Vpi=0.729

    gridworld.default_policyD = policyD

    return gridworld
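
The Vpi values in the comments above assume gamma=0.9, and the hand-coded policy happens to be optimal for step_reward=0.0, so value iteration should reproduce the same numbers. A quick check (dp_value_iteration imported as in the sketch after Example #1):

gridworld = get_gridworld(step_reward=0.0)

# With gamma=0.9 the printed V values should match the Vpi comments above,
# e.g. V(0,2) ~ 1.0, V(0,1) ~ 0.9, V(2,0) ~ 0.6561.
policy, state_value = dp_value_iteration(gridworld,
                                         do_summ_print=True,
                                         fmt_V='%.4f',
                                         fmt_R='%.2f',
                                         max_iter=1000,
                                         err_delta=0.0001,
                                         gamma=0.9)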
Example #9
    def __init__(self, name='Tiny Env'):
        EnvBaseline.__init__(self, name=name)
Example #10
    def __init__(self, name='Dummy Env'):
        EnvBaseline.__init__(self, name=name)
Example #11
def get_gambler(prob_heads=0.4):

    gambler = EnvBaseline( name='Gamblers Coin Flip Problem',
                           s_hash_rowL=s_hash_rowL,
                           colorD={100:'g', 0:'r'},
                           basic_color='skyblue' )
    gambler.set_info( 'Example 4.3 from Sutton & Barto 2nd Edition page 84.' )

    for s in range(1, 100): # 1 to 99
        s_max = min(s, 100-s)
        for a_desc in range(1, s_max + 1):
            gambler.add_action( s, a_desc, a_prob=1.0 )

    # define reward for all states
    def get_reward( sn ):
        if sn==100:
            return 1.0
        else:
            return 0.0

    # define all possible transitions.
    for s in range(1, 100): # 1 to 99
        s_max = min(s, 100-s)
        for a_desc in range(1, s_max + 1):
            sn_hash = s - a_desc
            rval = get_reward( sn_hash )
            gambler.add_transition( s, a_desc, sn_hash, t_prob=1.0-prob_heads, reward_obj=rval)

            sn_hash = s + a_desc
            rval = get_reward( sn_hash )
            gambler.add_transition( s, a_desc, sn_hash, t_prob=prob_heads, reward_obj=rval)
            
    gambler.define_env_states_actions()  # send all states and actions to environment

    # If there is a start state, define it here.
    gambler.start_state_hash = 50

    # define default policy (if any)
    gambler.default_policyD = {}
    
    return gambler
Example #12
def get_robot():

    robot = EnvBaseline(name='Slow-Fast Fallen Robot', s_hash_rowL=s_hash_rowL)
    robot.set_info("""
        Sample 3 State Fallen, Standing, Moving Robot.
        https://sandipanweb.wordpress.com/2017/03/23/some-reinforcement-learning-using-policy-value-iteration-and-q-learning-for-a-markov-decision-process-in-python-and-r/
        Some Reinforcement Learning: Using Policy & Value Iteration and Q-learning for a Markov Decision Process in Python and R
        """)

    robot.add_action('Fallen', 'Slow', a_prob=1.0)
    robot.add_action('Standing', 'Slow', a_prob=1.0)
    robot.add_action('Moving', 'Slow', a_prob=1.0)

    robot.add_action('Standing', 'Fast', a_prob=1.0)
    robot.add_action('Moving', 'Fast', a_prob=1.0)

    robot.add_transition('Fallen',
                         'Slow',
                         'Fallen',
                         t_prob=0.6,
                         reward_obj=-1.0)
    robot.add_transition('Fallen',
                         'Slow',
                         'Standing',
                         t_prob=0.4,
                         reward_obj=1.0)

    robot.add_transition('Standing',
                         'Slow',
                         'Moving',
                         t_prob=1.0,
                         reward_obj=1.0)
    robot.add_transition('Moving',
                         'Slow',
                         'Moving',
                         t_prob=1.0,
                         reward_obj=1.0)

    robot.add_transition('Standing',
                         'Fast',
                         'Moving',
                         t_prob=0.6,
                         reward_obj=2.0)
    robot.add_transition('Standing',
                         'Fast',
                         'Fallen',
                         t_prob=0.4,
                         reward_obj=-1.0)

    robot.add_transition('Moving',
                         'Fast',
                         'Moving',
                         t_prob=0.8,
                         reward_obj=2.0)
    robot.add_transition('Moving',
                         'Fast',
                         'Fallen',
                         t_prob=0.2,
                         reward_obj=-1.0)

    robot.define_env_states_actions(
    )  # send all states and actions to environment

    robot.start_state_hash = 'Standing'

    # define default policy (if any)
    policyD = {}  # index=state_hash, value=action_desc

    policyD['Standing'] = 'Slow'
    policyD['Fallen'] = 'Slow'
    policyD['Moving'] = 'Slow'
    robot.default_policyD = policyD

    return robot
Example #13
def get_gridworld(
    step_reward=0.0,
    N_mult=1,  # N_mult must be an integer.
    width=9,
    height=6,
    goal=(0, 8),
    start=(2, 0),
    wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):

    gridworld = EnvBaseline(
        name='Sutton Ex8.4 Priority Sweep Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.4 Priority Sweep Maze (the Ex8.1 Dyna Maze scaled up by N_mult)""")

    width_big = width * N_mult
    height_big = height * N_mult

    gridworld.characteristic_dim = width_big + height_big * 2
    # get relaxed optimal length from Zhang.
    gridworld.optimal_path_len = int(14 * N_mult * 1.2) + 1

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""

        di = 0
        dj = 0
        reward = 0

        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width_big:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height_big:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wall_set:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)

        if state_next_hash in goal_set:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    def make_big_set(pos):
        """Take an (i,j) position, pos, and expand to new, big size in x and y"""
        pos_set = set()
        ip, jp = pos
        ip *= N_mult
        jp *= N_mult
        for ixn in range(N_mult):
            for jxn in range(N_mult):
                pos_set.add((ip + ixn, jp + jxn))
        return pos_set

    # define default policy
    gridworld.default_policyD = {
    }  #index=s_hash, value=list of equiprobable actions

    # redefine start
    istart, jstart = start
    start = (istart * N_mult, jstart * N_mult)

    # make goal set
    goal_set = make_big_set(goal)

    # make wall set
    wall_set = set()
    for wall in wallL:
        wall_set.update(make_big_set(wall))

    # create state hash entries
    for i in range(height_big):
        for j in range(width_big):
            s_hash = (i, j)
            if (s_hash not in wall_set) and (s_hash not in goal_set):
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')
                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(
                        s_hash, a_desc,
                        a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(
                        s_hash, a_desc)
                    # add each event to transitions object
                    gridworld.add_transition(s_hash,
                                             a_desc,
                                             sn_hash,
                                             t_prob=1.0,
                                             reward_obj=reward_val)

    gridworld.define_env_states_actions(
    )  # send all states and actions to environment

    # --------------------

    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height_big):  # put (0,0) at upper left
        rowL = []
        for j in range(width_big):
            s = (i, j)
            if s in wall_set:
                rowL.append('"Wall"')
            else:
                rowL.append(s)

        # append keeps (0,0) at upper left (use insert(0, rowL) for lower left)
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {}
    named_s_hashD[start] = 'Start'
    for g in goal_set:
        named_s_hashD[g] = 'Goal'

    gridworld.layout = GenericLayout(gridworld,
                                     s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
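
N_mult expands every cell of the base 6x9 maze into an N_mult x N_mult block, so N_mult=2 gives a 12x18 grid with correspondingly enlarged Start, Goal and Wall regions. A short usage sketch (s_hash_print on the GenericLayout is assumed to behave like the simulation-layout calls shown elsewhere in these examples):

maze_1x = get_gridworld(N_mult=1)  # base 6 x 9 maze
maze_2x = get_gridworld(N_mult=2)  # scaled 12 x 18 maze

# Print the scaled layout to verify the enlarged 'Wall' and 'Goal' regions.
maze_2x.layout.s_hash_print()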
Example #14
# NOTE: step_reward is currently unused below; every transition uses the fixed -1.0 per-move reward of Sutton Ex4.1.
def get_gridworld(step_reward=-0.04):

    gridworld = EnvBaseline(name='Sutton Ex4.1 Grid World',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
        Example 4.1 grid 
        Label for blank space is "0" (both blanks are the same actual state)
        (i.e. upper left corner and lower right corner are state "0")
        """)

    for state_hash in range(1, 15):  # states are numbered 1-14
        for action_desc in ['U', 'D', 'R', 'L']:
            gridworld.add_action(state_hash, action_desc,
                                 a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash

            if a == 'U':
                sn = s - 4
            elif a == 'D':
                sn = s + 4
            elif a == 'R':
                if s not in [3, 7, 11]:
                    sn = s + 1
                else:
                    sn = s
            elif a == 'L':
                if s not in [4, 8, 12]:
                    sn = s - 1
                else:
                    sn = s

            if sn < 0:
                sn = s
            elif sn > 15:
                sn = s
            elif sn == 15:
                sn = 0

            gridworld.add_transition(state_hash,
                                     action_desc,
                                     sn,
                                     t_prob=1.0,
                                     reward_obj=-1.0)

    gridworld.define_env_states_actions(
    )  # send all states and actions to environment

    gridworld.start_state_hash = 12

    # define default policy (if any)
    policyD = {}  # index=state_hash, value=action_desc

    for (s_hash, a_desc) in gridworld.iter_state_hash_action_desc():
        if s_hash not in policyD:
            policyD[s_hash] = []
        policyD[s_hash].append((a_desc, 0.25))

    # make policyD entries hashable for later use (i.e. tuple, not list)
    for s_hash, aL in policyD.items():
        policyD[s_hash] = tuple(aL)

    gridworld.default_policyD = policyD

    return gridworld
Example #15
    def __init__(self, name='Layout Check'):

        EnvBaseline.__init__(self, name=name)
Example #16
BJ = BlackJackSimulation()
get_sim = Model(BJ, build_initial_model=True)

get_sim.collect_transition_data(num_det_calls=50, num_stoic_calls=100000)

BJ.layout.s_hash_print()

get_sim.num_calls_layout_print()
get_sim.min_num_calls_layout_print()

print('got sim data')
print('_' * 55)

env = EnvBaseline(s_hash_rowL=BJ.s_hash_rowL,
                  x_axis_label=BJ.x_axis_label,
                  y_axis_label=BJ.y_axis_label)

get_sim.add_all_data_to_an_environment(env)

print('built environment')
print('_' * 55)

#env.summ_print()
policy, state_value = dp_value_iteration(env,
                                         do_summ_print=True,
                                         fmt_V='%.1f',
                                         fmt_R='%.1f',
                                         max_iter=1000,
                                         err_delta=0.0001,
                                         gamma=0.9,
Example #17
                '    NOTE: Sum of Action Probability=%g  WILL BE NORMALIZED TO 1.0'
                % sum_a_prob)


if __name__ == "__main__":  # pragma: no cover

    from introrl.environments.env_baseline import EnvBaseline

    IO = DefineStateMoves('State_1')
    #IO.summ_print()

    #print('_'*55)
    IO.add_action('U', .5)
    IO.add_action('D', .51)  # action probs sum to 1.01 and will be normalized to 1.0
    #IO.summ_print()

    #print('_'*55)
    IO.add_transition('U', (2, 2), 0.2, 0.0)
    IO.add_transition('U', (2, 0), 0.79, 1.0)
    #IO.summ_print()

    #print('_'*55)
    IO.add_transition('D', (2, 0), 1.0, 1.0)
    IO.summ_print()

    print('_' * 55)

    env = EnvBaseline()
    IO.add_to_environment(env)
    env.summ_print()
Example #18
    if not get_sim.read_pickle_file(fname):
        get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)

    print('Total recorded actions Before:',
          "{:,}".format(get_sim.total_num_action_data_points()))
    get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=100)
    print('Total recorded actions After:',
          "{:,}".format(get_sim.total_num_action_data_points()))

    get_sim.save_to_pickle_file(fname)

    #get_sim.summ_print( long=False )
    print('got sim data')
    print('_' * 55)

    env = EnvBaseline(s_hash_rowL=CR.s_hash_rowL)
    get_sim.add_all_data_to_an_environment(env)

    #env.layout = GenericLayout( env )

    print('built environment')
    print('_' * 55)

    #env.summ_print()
    policy, state_value = dp_value_iteration(env,
                                             do_summ_print=True,
                                             fmt_V='%.1f',
                                             fmt_R='%.1f',
                                             max_iter=1000,
                                             err_delta=0.0001,
                                             gamma=0.9,
Example #19
def get_gridworld():
    gridworld = EnvBaseline(name='Sample Grid World',
                            s_hash_rowL=s_hash_rowL,
                            row_tickL=row_tickL,
                            x_axis_label=x_axis_label,
                            col_tickL=col_tickL,
                            y_axis_label=y_axis_label,
                            colorD={
                                'Goal': 'g',
                                'Pit': 'r',
                                'Start': 'b'
                            },
                            basic_color='skyblue')

    gridworld.set_info('Sample Grid World showing basic MDP creation.')

    # add actions from each state
    #   (note: a_prob will be normalized within add_action_dict)
    gridworld.add_action_dict(actionD)

    # for each action, define the next state and transition probability
    # (here we use the layout definition to aid the logic)
    for s_hash, aL in actionD.items():
        for a_desc in aL:
            sn_hash = get_next_state(s_hash, a_desc)
            reward = rewardD.get(sn_hash, 0.0)

            # for deterministic MDP, use t_prob=1.0
            gridworld.add_transition(s_hash,
                                     a_desc,
                                     sn_hash,
                                     t_prob=1.0,
                                     reward_obj=reward)

    # after the "add" commands, send all states and actions to environment
    # (any required normalization is done here as well.)
    gridworld.define_env_states_actions()

    # If there is a start state, define it here.
    gridworld.start_state_hash = 'Start'

    # If a limited number of start states are desired, define them here.
    gridworld.define_limited_start_state_list(['Start', (2, 2)])

    # if a default policy is desired, define it as a dict.
    gridworld.default_policyD = {
        (0, 0): 'R',
        (1, 0): 'U',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        'Start': 'U',
        (2, 2): 'U',
        (2, 1): 'R',
        (2, 3): 'L'
    }

    return gridworld
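
This example relies on module-level actionD, rewardD and get_next_state definitions that are not shown. Its start states and default policy match the Simple Grid World of Example #8, so a plausible sketch of those pieces follows; the 'Start', 'Goal' and 'Pit' state names are assumptions based on the colorD keys and start_state_hash above.

actionD = {
    (0, 0): ('D', 'R'),
    (0, 1): ('L', 'R'),
    (0, 2): ('L', 'D', 'R'),
    (1, 0): ('U', 'D'),
    (1, 2): ('U', 'D', 'R'),
    'Start': ('U', 'R'),    # the (2, 0) corner is named 'Start' in this layout
    (2, 1): ('L', 'R'),
    (2, 2): ('L', 'R', 'U'),
    (2, 3): ('L', 'U'),
}

rewardD = {'Goal': 1.0, 'Pit': -1.0}  # (0, 3) and (1, 3) renamed for display

def get_next_state(s_hash, a_desc):
    """Step one cell in the indicated direction on the 3x4 grid."""
    i, j = (2, 0) if s_hash == 'Start' else s_hash
    if a_desc == 'U':
        i -= 1
    elif a_desc == 'D':
        i += 1
    elif a_desc == 'R':
        j += 1
    elif a_desc == 'L':
        j -= 1
    if (i, j) == (0, 3):
        return 'Goal'
    if (i, j) == (1, 3):
        return 'Pit'
    if (i, j) == (2, 0):
        return 'Start'
    return (i, j)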
Example #20
def get_random_walk():

    env = EnvBaseline(name='Random Walk MRP')  # GenericLayout set below
    env.set_info('Random Walk MRP')

    actionD = {
        'A': ('L', 'R'),
        'B': ('L', 'R'),
        'C': ('L', 'R'),
        'D': ('L', 'R'),
        'E': ('L', 'R')
    }

    rewardD = {'Win': 1.0, 'Lose': 0.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash',s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose', 'A', 'B', 'C', 'D', 'E', 'Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions(
    )  # send all states and actions to environment

    # --------------------

    s_hash_rowL = [mrpL]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc

    policyD['A'] = ('L', 'R')
    policyD['B'] = ('L', 'R')
    policyD['C'] = ('L', 'R')
    policyD['D'] = ('L', 'R')
    policyD['E'] = ('L', 'R')

    env.default_policyD = policyD

    return env
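
For this five-state walk, with a reward of 1.0 for entering Win and 0.0 everywhere else, the values of A through E under the equiprobable policy are known to be 1/6 through 5/6 (gamma = 1). A small, self-contained check that solves the corresponding linear system directly, independent of introrl:

import numpy as np

# V_k = 0.5*V_{k-1} + 0.5*V_{k+1} for the five non-terminal states, with
# V(Lose) = 0 and the +1 reward for entering Win folded into the right-hand side.
A = np.zeros((5, 5))
b = np.zeros(5)
for k in range(5):
    A[k, k] = 1.0
    if k > 0:
        A[k, k - 1] = -0.5
    if k < 4:
        A[k, k + 1] = -0.5
    else:
        b[k] = 0.5 * 1.0  # stepping right from E enters 'Win' and earns reward 1
print(np.linalg.solve(A, b))  # -> approx [0.1667, 0.3333, 0.5, 0.6667, 0.8333] for A..E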
Example #21
    
    RW = RandomWalk_1000Simulation()
    #RW.layout.s_hash_print( none_str='*' )
    
    
    get_sim = Model( RW, build_initial_model=True )

    get_sim.collect_transition_data( num_det_calls=100, num_stoic_calls=10000 )

    RW.layout.s_hash_print()

    #get_sim.num_calls_layout_print()
    #get_sim.min_num_calls_layout_print()
    
    env = EnvBaseline( s_hash_rowL=RW.s_hash_rowL, 
                       x_axis_label=RW.x_axis_label, 
                       y_axis_label=RW.y_axis_label )
                       
    get_sim.add_all_data_to_an_environment( env )

    policy, state_value = dp_value_iteration( env, do_summ_print=True, fmt_V='%.3f', fmt_R='%.1f',
                                              max_iter=1000, err_delta=0.0001, 
                                              gamma=0.9, iteration_prints=10)
                                  
    policy.save_diagram( RW, inp_colorD=None, save_name='dp_rw1000_policy',
                         show_arrows=False, scale=0.5, h_over_w=0.8,
                         show_terminal_labels=False)

    print( 'Total Time =',time.time() - start_time )

    pickle_esp.save_to_pickle_file( fname='dp_soln_to_randwalk_1000', 
Example #22
def get_gridworld(step_reward=0.0,
                  width=9,
                  height=6,
                  goal=(0, 8),
                  start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):

    gridworld = EnvBaseline(
        name='Sutton Ex8.1 Dyna Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""

        di = 0
        dj = 0
        reward = 0

        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wallL:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)

        if state_next_hash == goal:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    # define default policy
    gridworld.default_policyD = {
    }  #index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash != goal:
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')
                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(
                        s_hash, a_desc,
                        a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(
                        s_hash, a_desc)
                    # add each event to transitions object
                    gridworld.add_transition(s_hash,
                                             a_desc,
                                             sn_hash,
                                             t_prob=1.0,
                                             reward_obj=reward_val)

    gridworld.define_env_states_actions(
    )  # send all states and actions to environment

    # --------------------

    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):  # put (0,0) at upper left
        rowL = []
        for j in range(width):
            s = (i, j)
            if s in wallL:
                rowL.append('"Wall"')
            else:
                rowL.append(s)

        # append keeps (0,0) at upper left (use insert(0, rowL) for lower left)
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {start: 'Start', goal: 'Goal'}
    gridworld.layout = GenericLayout(gridworld,
                                     s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
Example #23
def get_gridworld(step_reward=-0.04):

    gridworld = EnvBaseline(name='Sutton Ex3.5 5x5 Grid World',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
           Sutton 5x5 Gridworld
        Book Answer from page 65 (linear eqn solve) for gamma=0.9
         22.0     24.4      22.0      19.4      17.5
         19.8     22.0      19.8      17.8      16.0
         17.8     19.8      17.8      16.0      14.4
         16.0     17.8      16.0      14.4      13.0
         14.4     16.0      14.4      13.0      11.7
    =================================================    """)

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""

        di = 0
        dj = 0
        reward = 0

        if action == 'N':
            di = 1
        elif action == 'S':
            di = -1
        elif action == 'E':
            dj = 1
        elif action == 'W':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if (i == 4) and (j == 1):
            i_next = 0
            j_next = 1
            reward = 10
        elif (i == 4) and (j == 3):
            i_next = 2
            j_next = 3
            reward = 5
        elif (i_next < 0) or (i_next > 4) or (j_next < 0) or (j_next > 4):
            i_next = i
            j_next = j
            reward = -1

        state_next_hash = (i_next, j_next)
        return reward, state_next_hash

    # define default policy
    gridworld.default_policyD = {
    }  #index=s_hash, value=list of equiprobable actions

    for i in range(5):
        for j in range(5):
            s_hash = (i, j)

            gridworld.default_policyD[s_hash] = ('N', 'S', 'E', 'W')
            for a_desc in ['N', 'S', 'E', 'W']:
                gridworld.add_action(s_hash, a_desc,
                                     a_prob=1.0)  # a_prob will be normalized

                reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)
                # add each event to transitions object
                gridworld.add_transition(s_hash,
                                         a_desc,
                                         sn_hash,
                                         t_prob=1.0,
                                         reward_obj=reward_val)

    gridworld.define_env_states_actions(
    )  # send all states and actions to environment

    gridworld.start_state_hash = (0, 0)

    return gridworld
Example #24
def get_random_walk(Nside_states=9,
                    win_reward=1.0,
                    lose_reward=-1.0,
                    step_reward=0.0):

    Nstates = 2 * Nside_states + 1

    s = '(L%i, R%i)' % (Nside_states, Nside_states)
    env = EnvBaseline(name='%i State Random Walk MRP' % Nstates +
                      s)  # GenericLayout set below
    env.set_info('%i State Random Walk MRP' % Nstates + s)

    RstateL = ['R+%i' % i for i in range(1, Nside_states + 1)]
    LstateL = list(reversed([s.replace('R+', 'L-') for s in RstateL]))

    actionD = {}
    for s in LstateL:
        actionD[s] = ('L', 'R')
    actionD['C'] = ('L', 'R')
    for s in RstateL:
        actionD[s] = ('L', 'R')

    rewardD = {'Win': win_reward, 'Lose': lose_reward}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash',s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, step_reward)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose'] + LstateL + ['C'] + RstateL + ['Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions(
    )  # send all states and actions to environment

    # -------------------- make layout for printing ------------------

    s_hash_rowL = [mrpL]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc

    policyD['C'] = ('L', 'R')
    for s in LstateL:
        policyD[s] = ('L', 'R')
    for s in RstateL:
        policyD[s] = ('L', 'R')

    env.default_policyD = policyD

    return env