def get_env():
    env = EnvBaseline(name='Simple Six State World')
    env.set_info('Simple Six State World')

    actionD = {'A': ('U',),
               'B': ('ur', 'D'),
               '<C>': ('ur', 'dl'),
               'D': ('ur', 'ul')}

    rewardD = {'A': -1.0, 'E': 0.5, 'F': 1.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    add_event('A', 'U', 'B')
    #add_event('A', 'Te', 'E')
    add_event('B', 'D', 'A')
    add_event('B', 'ur', '<C>')
    add_event('<C>', 'dl', 'B')
    add_event('<C>', 'ur', 'D')
    add_event('D', 'ur', 'F')
    add_event('D', 'ul', 'E')

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = [('*', 'E', '*', 'F'),
                   ('*', '*', 'D', '*'),
                   ('*', '<C>', '*', '*'),
                   ('B', '*', '*', '*'),
                   ('A', '*', '*', '*')]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = '<C>'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['B'] = 1
    policyD['<C>'] = 1
    policyD['D'] = 1
    env.default_policyD = policyD

    return env
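# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): build the
# six-state environment and print the attributes configured above.
# Assumes the EnvBaseline / GenericLayout imports at the top of this
# module are in place.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    env = get_env()
    print('Start state:', env.start_state_hash)    # expected '<C>'
    print('Default policy:', env.default_policyD)  # {'B': 1, '<C>': 1, 'D': 1}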
def get_random_walk():
    env = EnvBaseline(name='Random Walk MRP')  # GenericLayout set below
    env.set_info('Random Walk MRP')

    actionD = {'A': ('L', 'R'),
               'B': ('L', 'R'),
               'C': ('L', 'R'),
               'D': ('L', 'R'),
               'E': ('L', 'R')}

    rewardD = {'Win': 1.0, 'Lose': 0.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash', s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose', 'A', 'B', 'C', 'D', 'E', 'Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = [mrpL]
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['A'] = ('L', 'R')
    policyD['B'] = ('L', 'R')
    policyD['C'] = ('L', 'R')
    policyD['D'] = ('L', 'R')
    policyD['E'] = ('L', 'R')
    env.default_policyD = policyD

    return env
def get_gridworld(step_reward=-1, height=7, goal=(3, 7),
                  windT=(0, 0, 0, 1, 1, 1, 2, 2, 1, 0)):
    """
    Windy Gridworld with (0,0) at lower left.
    Width is defined by the length of the windT tuple.
    """
    gridworld = EnvBaseline(name='Windy Kings Gridworld')  # GenericLayout set below
    gridworld.set_info('Windy Gridworld with king moves (8 compass directions).')

    width = len(windT)

    def get_action_snext(s_hash, action):
        """returns state_next_hash"""
        di = 0
        dj = 0
        if 'N' in action:
            di = 1
        elif 'S' in action:
            di = -1

        if 'E' in action:
            dj = 1
        elif 'W' in action:
            dj = -1

        (i, j) = s_hash
        wind_di = windT[j]

        i_next = i + di
        # constrain basic move to be inside the grid
        i_next = max(0, min(height - 1, i_next))
        i_next += wind_di  # add wind to constrained move.
        j_next = j + dj

        # constrain next position to be inside the grid
        i_next = max(0, min(height - 1, i_next))
        j_next = max(0, min(width - 1, j_next))

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            state_next_hash = 'Goal'

        return state_next_hash

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash == goal:
                pass  # s_hash == 'Goal'
            else:
                gridworld.default_policyD[s_hash] = ('N', 'S', 'E', 'W',
                                                     'NE', 'SE', 'SW', 'NW')
                for a_desc in ['N', 'S', 'E', 'W', 'NE', 'SE', 'SW', 'NW']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized

                    sn_hash = get_action_snext(s_hash, a_desc)

                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=step_reward)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):
        rowL = []
        for j in range(width):
            s_hash = (i, j)
            if s_hash == goal:
                s_hash = 'Goal'
            rowL.append(s_hash)
        # use insert to put (0,0) at lower left, append for upper left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     col_tickL=windT,
                                     x_axis_label='Upward Wind Speed')

    gridworld.start_state_hash = (3, 0)

    return gridworld
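# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): build the windy
# king-moves gridworld with its default wind pattern and spot-check the
# attributes set above.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    gw = get_gridworld()
    print('Start state:', gw.start_state_hash)               # expected (3, 0)
    print('States with a policy:', len(gw.default_policyD))  # 7x10 grid minus the goal = 69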
def get_gridworld(step_reward=0.0):
    gridworld = EnvBaseline(name='Simple Grid World')  # GenericLayout set below
    gridworld.set_info('Simple Grid World Example.')

    actionD = {(0, 0): ('D', 'R'),
               (0, 1): ('L', 'R'),
               (0, 2): ('L', 'D', 'R'),
               (1, 0): ('U', 'D'),
               (1, 2): ('U', 'D', 'R'),
               (2, 0): ('U', 'R'),
               (2, 1): ('L', 'R'),
               (2, 2): ('L', 'R', 'U'),
               (2, 3): ('L', 'U')}

    rewardD = {(0, 3): 1, (1, 3): -1}

    for state_hash, actionL in actionD.items():
        for action_desc in actionL:
            gridworld.add_action(state_hash, action_desc, a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash
            if a == 'U':
                state_next_hash = (s[0] - 1, s[1])
            elif a == 'D':
                state_next_hash = (s[0] + 1, s[1])
            elif a == 'R':
                state_next_hash = (s[0], s[1] + 1)
            elif a == 'L':
                state_next_hash = (s[0], s[1] - 1)

            reward_val = rewardD.get(state_next_hash, step_reward)

            gridworld.add_transition(state_hash, action_desc, state_next_hash,
                                     t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.layout = GenericLayout(gridworld)  # uses default "get_layout_row_col_of_state"

    # If there is a start state, define it here.
    gridworld.start_state_hash = (2, 0)
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # define default policy (if any)
    # Policy Dictionary for: GridWorld
    policyD = {}  # index=state_hash, value=action_desc

    # Vpi shown for gamma=0.9
    policyD[(0, 0)] = 'R'  # Vpi=0.81
    policyD[(1, 0)] = 'U'  # Vpi=0.729
    policyD[(0, 1)] = 'R'  # Vpi=0.9
    policyD[(0, 2)] = 'R'  # Vpi=1.0
    policyD[(1, 2)] = 'U'  # Vpi=0.9
    policyD[(2, 0)] = 'U'  # Vpi=0.6561
    policyD[(2, 2)] = 'U'  # Vpi=0.81
    policyD[(2, 1)] = 'R'  # Vpi=0.729
    policyD[(2, 3)] = 'L'  # Vpi=0.729

    gridworld.default_policyD = policyD

    return gridworld
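# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): build the simple
# grid world and show the default-policy action at the start state.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    gw = get_gridworld()
    print('Start state:', gw.start_state_hash)                    # expected (2, 0)
    print('Policy action at start:', gw.default_policyD[(2, 0)])  # 'U'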
def get_gridworld(step_reward=0.0, width=9, height=6, goal=(0, 8), start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):

    gridworld = EnvBaseline(name='Sutton Ex8.1 Dyna Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        di = 0
        dj = 0
        reward = 0
        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wallL:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash != goal:
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')
                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)

                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):  # put (0,0) at upper left
        rowL = []
        for j in range(width):
            s = (i, j)
            if s in wallL:
                rowL.append('"Wall"')
            else:
                rowL.append(s)
        # append keeps (0,0) at upper left
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {start: 'Start', goal: 'Goal'}

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
def get_gridworld(step_reward=0.0,
                  N_mult=1,  # N_mult must be an integer.
                  width=9, height=6, goal=(0, 8), start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):

    gridworld = EnvBaseline(name='Sutton Ex8.4 Priority Sweep Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.4 Priority Sweep Maze (scaled from the Ex8.1 Dyna Maze)""")

    width_big = width * N_mult
    height_big = height * N_mult
    gridworld.characteristic_dim = width_big + height_big * 2

    # get relaxed optimal length from Zhang.
    gridworld.optimal_path_len = int(14 * N_mult * 1.2) + 1

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        di = 0
        dj = 0
        reward = 0
        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width_big:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height_big:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wall_set:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        if state_next_hash in goal_set:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    def make_big_set(pos):
        """Take an (i,j) position, pos, and expand to new, big size in x and y"""
        pos_set = set()
        ip, jp = pos
        ip *= N_mult
        jp *= N_mult
        for ixn in range(N_mult):
            for jxn in range(N_mult):
                pos_set.add((ip + ixn, jp + jxn))
        return pos_set

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    # redefine start
    istart, jstart = start
    start = (istart * N_mult, jstart * N_mult)

    # make goal set
    goal_set = make_big_set(goal)

    # make wall set
    wall_set = set()
    for wall in wallL:
        wall_set.update(make_big_set(wall))

    # create state hash entries
    for i in range(height_big):
        for j in range(width_big):
            s_hash = (i, j)
            if (s_hash not in wall_set) and (s_hash not in goal_set):
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')
                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)

                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height_big):  # put (0,0) at upper left
        rowL = []
        for j in range(width_big):
            s = (i, j)
            if s in wall_set:
                rowL.append('"Wall"')
            else:
                rowL.append(s)
        # append keeps (0,0) at upper left
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {}
    named_s_hashD[start] = 'Start'
    for g in goal_set:
        named_s_hashD[g] = 'Goal'

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
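# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): scale the maze
# with N_mult and check the derived attributes set above.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    maze = get_gridworld(N_mult=2)
    print('Scaled start state:', maze.start_state_hash)           # (2,0) scaled -> (4, 0)
    print('Characteristic dim:', maze.characteristic_dim)         # 18 + 2*12 = 42
    print('Relaxed optimal path length:', maze.optimal_path_len)  # int(14*2*1.2) + 1 = 34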
def get_env():
    env = EnvBaseline(name="Jacks Car Rental (const rtn)")  # GenericLayout set below

    simplified_str = """Shangtong Zhang's simplified model such that the # of cars returned
in daytime becomes constant, rather than a random value from a poisson distribution,
which will reduce calculation time and leave the optimal policy/value state matrix
almost the same."""

    env.set_info('Example 4.2 from Sutton & Barto 2nd Edition page 81.\n' + simplified_str)

    # define all possible actions.
    saL = []  # a list of (s1, s2, a_desc)
    s_hash_rowL = []  # layout rows for making 2D output

    for s1 in range(MAX_CARS + 1):  # 20 cars max
        rowL = []  # row of s_hash_rowL
        for s2 in range(MAX_CARS + 1):  # 20 cars max
            s_hash = (s1, s2)
            rowL.append(s_hash)

            for a_desc in range(-5, 6):  # -5 moves 5 cars from 2nd to 1st. +5 from 1st to 2nd.
                if a_desc < 0:  # can only move cars if they are present
                    if abs(a_desc) <= s2:
                        env.add_action(s_hash, a_desc, a_prob=1.0)
                        saL.append((s1, s2, a_desc))
                else:
                    if a_desc <= s1:  # can only move cars if they are present
                        env.add_action(s_hash, a_desc, a_prob=1.0)
                        saL.append((s1, s2, a_desc))

        # use insert to put (0,0) at lower left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output

    # ------------------------------
    # figure out transition probabilities and rewards
    for s1 in range(MAX_CARS + 1):
        for s2 in range(MAX_CARS + 1):
            for a_desc in range(-5, 6):
                get_prob_reward(s1, s2, a_desc)

    # ------------------------------
    print('\nStarting to define car rental transitions')
    # with all the probability figured out, define all transitions
    for (s1, s2, a_desc, sn_hash), t_prob in total_probD.items():
        txr = sum_prob_x_rewardD[(s1, s2, a_desc, sn_hash)]
        rval = txr / t_prob
        env.add_transition((s1, s2), a_desc, sn_hash, t_prob=t_prob, reward_obj=rval)

        #if s1 == 10 and s2 == 10:
        #    print('for (10,10) a_desc=', a_desc, ' sn_hash=', sn_hash,
        #          ' t_prob=', t_prob, ' rval=', rval)

    print('Calling: env.define_env_states_actions')
    env.define_env_states_actions()  # send all states and actions to environment
    print('Environment Ready.')

    # If there is a start state, define it here.
    env.start_state_hash = (10, 10)

    # define default policy (if any)
    env.default_policyD = {}

    # --------------------
    # define layout for output
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL,
                               x_axis_label='#Cars at Second Location',
                               y_axis_label='#Cars at First Location')

    return env
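# ----------------------------------------------------------------------
# Hedged sketch of the module-level scaffolding that get_env() above
# assumes already exists (conceptually it belongs before get_env()).
# The names and shapes below are inferred from how they are used inside
# get_env(); the actual Poisson rental/return math belongs in
# get_prob_reward() and is deliberately omitted here.
# ----------------------------------------------------------------------
MAX_CARS = 20  # "20 cars max" per location, as noted in get_env()

# (s1, s2, a_desc, sn_hash) -> accumulated transition probability
total_probD = {}
# (s1, s2, a_desc, sn_hash) -> accumulated (probability * reward)
sum_prob_x_rewardD = {}

def get_prob_reward(s1, s2, a_desc):
    """Populate total_probD and sum_prob_x_rewardD for every next state
    reachable from (s1, s2) under action a_desc (sketch only)."""
    raise NotImplementedError('rental/return dynamics not shown in this sketch')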
def get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0):

    Nstates = 2 * Nside_states + 1
    s = '(L%i, R%i)' % (Nside_states, Nside_states)
    env = EnvBaseline(name='%i State Random Walk MRP' % Nstates + s)  # GenericLayout set below
    env.set_info('%i State Random Walk MRP' % Nstates + s)

    RstateL = ['R+%i' % i for i in range(1, Nside_states + 1)]
    LstateL = list(reversed([s.replace('R+', 'L-') for s in RstateL]))

    actionD = {}
    for s in LstateL:
        actionD[s] = ('L', 'R')
    actionD['C'] = ('L', 'R')
    for s in RstateL:
        actionD[s] = ('L', 'R')

    rewardD = {'Win': win_reward, 'Lose': lose_reward}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash', s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, step_reward)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose'] + LstateL + ['C'] + RstateL + ['Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # -------------------- make layout for printing ------------------
    s_hash_rowL = [mrpL]
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['C'] = ('L', 'R')
    for s in LstateL:
        policyD[s] = ('L', 'R')
    for s in RstateL:
        policyD[s] = ('L', 'R')
    env.default_policyD = policyD

    return env
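# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): build the default
# 19-state random walk (Nside_states=9 gives L-9..L-1, C, R+1..R+9 plus
# the Win/Lose terminals) and spot-check the attributes set above.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    env = get_random_walk()
    print('Start state:', env.start_state_hash)        # 'C'
    print('Policy states:', len(env.default_policyD))  # 19 non-terminal states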