def get_env():
    env = EnvBaseline(name='Simple Six State World')
    env.set_info('Simple Six State World')

    actionD = {'A': ('U', ),
               'B': ('ur', 'D'),
               '<C>': ('ur', 'dl'),
               'D': ('ur', 'ul')}

    rewardD = {'A': -1.0, 'E': 0.5, 'F': 1.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    add_event('A', 'U', 'B')
    #add_event( 'A', 'Te', 'E' )
    add_event('B', 'D', 'A')
    add_event('B', 'ur', '<C>')
    add_event('<C>', 'dl', 'B')
    add_event('<C>', 'ur', 'D')
    add_event('D', 'ur', 'F')
    add_event('D', 'ul', 'E')

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = [('*', 'E', '*', 'F'),
                   ('*', '*', 'D', '*'),
                   ('*', '<C>', '*', '*'),
                   ('B', '*', '*', '*'),
                   ('A', '*', '*', '*')]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = '<C>'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['B'] = 1
    policyD['<C>'] = 1
    policyD['D'] = 1
    env.default_policyD = policyD

    return env
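# A minimal usage sketch (assumed demo script, not part of the original module):
# build the six-state world and print its state/action/layout summary using
# the EnvBaseline methods defined elsewhere in this package.
if __name__ == "__main__":  # pragma: no cover
    demo_env = get_env()
    demo_env.summ_print(long=False)
    print('default policy:', demo_env.get_default_policy_desc_dict())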
def get_random_walk():
    env = EnvBaseline(name='Random Walk MRP')  # GenericLayout set below
    env.set_info('Random Walk MRP')

    actionD = {'A': ('L', 'R'),
               'B': ('L', 'R'),
               'C': ('L', 'R'),
               'D': ('L', 'R'),
               'E': ('L', 'R')}

    rewardD = {'Win': 1.0, 'Lose': 0.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash',s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose', 'A', 'B', 'C', 'D', 'E', 'Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = [mrpL]
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['A'] = ('L', 'R')
    policyD['B'] = ('L', 'R')
    policyD['C'] = ('L', 'R')
    policyD['D'] = ('L', 'R')
    policyD['E'] = ('L', 'R')
    env.default_policyD = policyD

    return env
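# A minimal usage sketch (assumed demo, not part of the original module):
# walk one random episode from the start state 'C' using the environment's
# own transition machinery until a terminal state ('Win' or 'Lose') is reached.
if __name__ == "__main__":  # pragma: no cover
    import random

    walk_env = get_random_walk()
    s = walk_env.start_state_hash
    r = 0.0
    while not walk_env.is_terminal_state(s):
        a = random.choice(walk_env.get_state_legal_action_list(s))
        s, r = walk_env.get_action_snext_reward(s, a)
    print('Episode ended in', s, 'with final reward', r)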
def define_environment(self):
    rt = 0.0  # per-step reward (assumed constant placeholder)

    for state_hash in range(1, 3):
        for action_desc in [-1, 1]:
            self.add_action(state_hash, action_desc, a_prob=1.0)  # a_prob will be normalized

            sn = state_hash + action_desc
            self.add_transition(state_hash, action_desc, sn, t_prob=1.0, reward_obj=rt)

    self.define_env_states_actions()  # send all states and actions to environment

    self.start_state_hash = 12
    self.layout = GenericLayout(self)
def __init__(self, name='Basic Sim', s_hash_rowL=None,
             row_tickL=None, col_tickL=None,
             x_axis_label='', y_axis_label='',
             colorD=None, basic_color='', start_time=0):
    """A Black Box Interface to a Simulation"""

    self.name = name
    self.info = """A Black Box Interface to a Simulation."""

    self.s_hash_rowL = s_hash_rowL
    self.row_tickL = row_tickL
    self.col_tickL = col_tickL
    self.x_axis_label = x_axis_label
    self.y_axis_label = y_axis_label
    self.colorD = colorD
    self.basic_color = basic_color

    # state hash is (# cars at 1st site, # cars at 2nd site)
    self.action_state_set = set([0, 1, 2, 3])  # a set of action state hashes
    self.terminal_set = set([4])               # a set of terminal state hashes

    if s_hash_rowL is None:
        self.layout = None  # may have a layout object for display purposes. (e.g. GenericLayout)
    else:
        self.layout = GenericLayout(self, s_hash_rowL=s_hash_rowL,
                                    row_tickL=row_tickL, col_tickL=col_tickL,
                                    x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                                    colorD=colorD, basic_color=basic_color)

    self.default_policyD = None  # may define later.
def define_environment(self):
    # possible moves: ('U','ur','R','dr','D','dl','L','ul','Te')
    actionD = {'A': ('U', ),  #'Te'),
               'B': ('ur', 'D'),
               '<C>': ('ur', 'dl'),
               'D': ('ur', 'ul')}

    rewardD = {'A': -1.0, 'E': 0.5, 'F': 1.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            self.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        r = rewardD.get(sn_hash, 0.0)
        self.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    add_event('A', 'U', 'B')
    #add_event( 'A', 'Te', 'E' )
    add_event('B', 'D', 'A')
    add_event('B', 'ur', '<C>')
    add_event('<C>', 'dl', 'B')
    add_event('<C>', 'ur', 'D')
    add_event('D', 'ur', 'F')
    add_event('D', 'ul', 'E')

    self.define_env_states_actions()  # send all states and actions to environment

    s_hash_rowL = [('*', 'E', '*', 'F'),
                   ('*', '*', 'D', '*'),
                   ('*', '<C>', '*', '*'),
                   ('B', '*', '*', '*'),
                   ('A', '*', '*', '*')]

    self.layout = GenericLayout(self, s_hash_rowL=s_hash_rowL)
def get_gridworld(step_reward=-1, height=7, goal=(3, 7),
                  windT=(0, 0, 0, 1, 1, 1, 2, 2, 1, 0)):
    """
    Windy Gridworld with (0,0) at lower left.
    Width is defined by the length of the windT tuple.
    """
    gridworld = EnvBaseline(name='Windy Kings Gridworld')  # GenericLayout set below
    gridworld.set_info("""""")

    width = len(windT)

    def get_action_snext(s_hash, action):
        """returns state_next_hash"""
        di = 0
        dj = 0
        if 'N' in action:
            di = 1
        elif 'S' in action:
            di = -1

        if 'E' in action:
            dj = 1
        elif 'W' in action:
            dj = -1

        (i, j) = s_hash
        wind_di = windT[j]

        i_next = i + di
        # constrain basic move to be inside the grid
        i_next = max(0, min(height - 1, i_next))
        i_next += wind_di  # add wind to constrained move.

        j_next = j + dj

        # constrain next position to be inside the grid
        i_next = max(0, min(height - 1, i_next))
        j_next = max(0, min(width - 1, j_next))

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            state_next_hash = 'Goal'

        return state_next_hash

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash == goal:
                pass  # s_hash == 'Goal'
            else:
                gridworld.default_policyD[s_hash] = ('N', 'S', 'E', 'W',
                                                     'NE', 'SE', 'SW', 'NW')

                for a_desc in ['N', 'S', 'E', 'W', 'NE', 'SE', 'SW', 'NW']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized

                    sn_hash = get_action_snext(s_hash, a_desc)

                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=step_reward)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):
        rowL = []
        for j in range(width):
            s_hash = (i, j)
            if s_hash == goal:
                s_hash = 'Goal'
            rowL.append(s_hash)
        # use insert to put (0,0) at lower left, append for upper left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     col_tickL=windT,
                                     x_axis_label='Upward Wind Speed')

    gridworld.start_state_hash = (3, 0)

    return gridworld
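# A minimal usage sketch (assumed demo, not part of the original module):
# build the windy king-moves gridworld and print its wind-labeled layout.
if __name__ == "__main__":  # pragma: no cover
    gw = get_gridworld()
    gw.summ_print(long=False)
    print('start state:', gw.start_state_hash)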
def get_gridworld(step_reward=0.0):
    gridworld = EnvBaseline(name='Simple Grid World')  # GenericLayout set below
    gridworld.set_info('Simple Grid World Example.')

    actionD = {(0, 0): ('D', 'R'),
               (0, 1): ('L', 'R'),
               (0, 2): ('L', 'D', 'R'),
               (1, 0): ('U', 'D'),
               (1, 2): ('U', 'D', 'R'),
               (2, 0): ('U', 'R'),
               (2, 1): ('L', 'R'),
               (2, 2): ('L', 'R', 'U'),
               (2, 3): ('L', 'U')}

    rewardD = {(0, 3): 1, (1, 3): -1}

    for state_hash, actionL in actionD.items():
        for action_desc in actionL:
            gridworld.add_action(state_hash, action_desc, a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash
            if a == 'U':
                state_next_hash = (s[0] - 1, s[1])
            elif a == 'D':
                state_next_hash = (s[0] + 1, s[1])
            elif a == 'R':
                state_next_hash = (s[0], s[1] + 1)
            elif a == 'L':
                state_next_hash = (s[0], s[1] - 1)

            reward_val = rewardD.get(state_next_hash, step_reward)

            gridworld.add_transition(state_hash, action_desc, state_next_hash,
                                     t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.layout = GenericLayout(gridworld)  # uses default "get_layout_row_col_of_state"

    # If there is a start state, define it here.
    gridworld.start_state_hash = (2, 0)
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # define default policy (if any)
    # Policy Dictionary for: GridWorld
    policyD = {}  # index=state_hash, value=action_desc

    # Vpi shown for gamma=0.9
    policyD[(0, 0)] = 'R'  # Vpi=0.81
    policyD[(1, 0)] = 'U'  # Vpi=0.729
    policyD[(0, 1)] = 'R'  # Vpi=0.9
    policyD[(0, 2)] = 'R'  # Vpi=1.0
    policyD[(1, 2)] = 'U'  # Vpi=0.9
    policyD[(2, 0)] = 'U'  # Vpi=0.6561
    policyD[(2, 2)] = 'U'  # Vpi=0.81
    policyD[(2, 1)] = 'R'  # Vpi=0.729
    policyD[(2, 3)] = 'L'  # Vpi=0.729

    gridworld.default_policyD = policyD

    return gridworld
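# A minimal usage sketch (assumed demo, not part of the original module):
# score the deterministic default policy from the start state (2, 0) using
# EnvBaseline.get_policy_score.
if __name__ == "__main__":  # pragma: no cover
    gw = get_gridworld(step_reward=0.0)
    r_sum, n_steps, msg = gw.get_policy_score(policy=gw.get_default_policy_desc_dict(),
                                              start_state_hash=(2, 0))
    print('policy score:', r_sum, 'in', n_steps, 'steps')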
def get_gridworld(step_reward=0.0, width=9, height=6, goal=(0, 8), start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):
    gridworld = EnvBaseline(name='Sutton Ex8.1 Dyna Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        di = 0
        dj = 0
        reward = 0

        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wallL:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash != goal:
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')

                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)

                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):  # (0,0) is at the upper left
        rowL = []
        for j in range(width):
            s = (i, j)
            if s in wallL:
                rowL.append('"Wall"')
            else:
                rowL.append(s)
        # append keeps (0,0) at the upper left (insert(0, rowL) would put it at lower left)
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {start: 'Start', goal: 'Goal'}

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
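# A minimal usage sketch (assumed demo, not part of the original module):
# count states in the Dyna Maze and print its one-step reward layout.
if __name__ == "__main__":  # pragma: no cover
    maze = get_gridworld()
    print(maze.get_num_action_states(), 'action states,',
          maze.get_num_terminal_states(), 'terminal state(s)')
    maze.layout_print(vname='reward', fmt='%g', none_str='*')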
class EnvBaseline(object):

    def __init__(self, name='Generic Environment',
                 s_hash_rowL=None, row_tickL=None, x_axis_label='',
                 col_tickL=None, y_axis_label='',
                 colorD=None, basic_color='', mdp_file=None):
        """
        A Basic Environment from which all others derive.
        If "s_hash_rowL" is input, it will be used to calc state layout.
        colorD and basic_color affect the GenericLayout when saved as an image.
        """
        self.name = name

        self.define_statesD = {}  # index=s_hash: value=DefineStateMoves object for s_hash

        self.TC = TransitionColl(name=name + ' TransitionColl')
        self.default_policyD = None  # may define later.

        # for convenience, make TransitionColl objects available locally
        self.SAC = self.TC.sa_coll      # share sa_coll with TransitionColl
        self.AC = self.TC.action_coll   # share action_coll with TransitionColl
        self.SC = self.TC.state_coll    # share state_coll with TransitionColl

        self.define_environment()

        # can define a start state list smaller than all action states
        # (to create it, call define_limited_start_state_list( state_list ))
        self.defined_limited_start_state_list = None

        self.terminal_set, self.action_state_set = self.TC.get_terminal_set_and_action_set()

        self.info = """A Basic Environment for solving Reinforcement Learning Problems."""

        self.layout = GenericLayout(self, s_hash_rowL=s_hash_rowL,
                                    row_tickL=row_tickL, x_axis_label=x_axis_label,
                                    col_tickL=col_tickL, y_axis_label=y_axis_label,
                                    colorD=colorD, basic_color=basic_color)

        self.failed_mdp_file_read = False
        if mdp_file is not None:
            if not self.read_pickle_file(mdp_file):
                print('WARNING... FAILED TO OPEN MDP FILE:', mdp_file)
                print('=' * 66)
                print('=' * 66)
                print('=' * 66)
                #sys.exit()
                self.failed_mdp_file_read = True

    def get_policy_score(self, policy=None, start_state_hash=None, step_limit=1000):
        """
        Given a Policy object OR a policy dictionary, apply it to the Environment
        and return a score.
        Can iterate over limited_start_state_list, or simply start at start_state_hash.
        """
        if policy is None:
            policy = self.SAC

        if start_state_hash is None:
            s_hash = self.start_state_hash
        else:
            s_hash = start_state_hash

        r_sum = 0.0
        n_steps = 0
        a_desc = policy.get(s_hash, None)
        #print( policy )

        while (a_desc is not None) and (n_steps < step_limit):
            sn_hash, reward = self.get_action_snext_reward(s_hash, a_desc)
            try:  # if reward is numeric, add to r_sum
                r_sum += reward
            except:
                pass
            n_steps += 1

            s_hash = sn_hash
            a_desc = policy.get(s_hash, None)

        msg = ''  # any special message(s)
        return (r_sum, n_steps, msg)  # can OVERRIDE this to return a more meaningful score.

    def make_pickle_filename(self, fname):
        """Make a file name ending with .mdp_pickle"""
        if fname is None:
            fname = self.name.replace(' ', '_').replace('.', '_') + '.mdp_pickle'
        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.mdp_pickle'
        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        #raise ValueError( 'save_to_pickle_file is BROKEN... DO NOT USE' )
        fname = self.make_pickle_filename(fname)
        print('Saving Environment to pickle file:', fname)

        saveD = {}
        saveD['name'] = self.name
        saveD['define_statesD'] = self.define_statesD
        saveD['info'] = self.info
        saveD['layout'] = self.layout

        if hasattr(self, 'start_state_hash'):
            saveD['start_state_hash'] = self.start_state_hash
        if hasattr(self, 'defined_limited_start_state_list'):
            saveD['defined_limited_start_state_list'] = self.defined_limited_start_state_list

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject, protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""
        #raise ValueError( 'read_pickle_file is BROKEN... DO NOT USE' )
        fname = self.make_pickle_filename(fname)

        if os.path.isfile(fname):
            pass  # all good
        elif os.path.isfile(os.path.join(mdp_path, fname)):
            fname = os.path.join(mdp_path, fname)
        else:
            print('Pickle File NOT found:', fname)
            print('mdp_path:', mdp_path)
            s = '''Try running: "introrl_build_mdp" to create MDP Pickle Files.
Type: introrl_build_mdp at the command line.'''
            banner(s, banner_char='', leftMargin=0, just='center')
            return False

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)

        self.name = readD['name']
        self.define_statesD = readD['define_statesD']
        self.info = readD['info']
        self.layout = readD['layout']

        if 'start_state_hash' in readD:
            self.start_state_hash = readD['start_state_hash']
        if 'defined_limited_start_state_list' in readD:
            self.defined_limited_start_state_list = readD['defined_limited_start_state_list']

        self.define_env_states_actions()  # use define_statesD to initialize data structures
        # ----------------------
        fileObject.close()
        return True

    def set_info(self, info):
        """Input a string that describes the Environment."""
        self.info = info

    def get_info(self):
        lmax = max([len(s) for s in self.info.split('\n')])
        lmax = max(16, lmax)
        return '\n' + 'INFO'.center(lmax, '_') + '\n' + self.info + '\n' + '_' * lmax + '\n'

    def add_action_dict(self, actionD):
        """
        Iterate through a dictionary of actions, calling "add_action" for each one.
        actionD: index=s_hash, value=list of a_desc
        """
        for s_hash, aL in actionD.items():
            a_prob = 1.0 / float(len(aL))
            for a_desc in aL:
                self.add_action(s_hash, a_desc, a_prob=a_prob)

    def add_action(self, s_hash, a_desc, a_prob=1.0):
        if s_hash not in self.define_statesD:
            self.define_statesD[s_hash] = DefineStateMoves(s_hash)
        self.define_statesD[s_hash].add_action(a_desc, a_prob)

    def add_transition(self, s_hash, a_desc, snext_hash, t_prob=1.0,
                       reward_obj=Reward(const=0.0)):
        if s_hash not in self.define_statesD:
            self.define_statesD[s_hash] = DefineStateMoves(s_hash)
        self.define_statesD[s_hash].add_transition(a_desc, snext_hash, t_prob, reward_obj)

    def define_env_states_actions(self):
        """
        Will Set or Add prob and Reward entries to sn_probD and sn_rewardD.

        action_prob controls the probability of picking an action from a list of actions.
        i.e. if in state s, there can be a list of (a1,p1), (a2,p2), (a3,p3), etc.

        trans_prob controls the probability of picking the next state from a list of next states.
        i.e. if taking action a in state s, there can be a list of (sn1,p1), (sn2,p2), (sn3,p3), etc.

        The Reward object is always associated with (s,a,sn); however, it can vary
        with probability distributions of its own.
        """
        for (s_hash, DSM) in self.define_statesD.items():
            DSM.add_to_environment(self)

        # with TC updated, recalc terminal_set
        self.terminal_set, self.action_state_set = self.TC.get_terminal_set_and_action_set()

    def define_environment(self):  # pragma: no cover
        """OVERRIDE THIS in order to define the environment."""
        # set up environment with calls to:
        # self.add_action( s_hash, a_desc, a_prob=1.0 )
        # self.add_transition( s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=Reward( const=0.0 ))
        # ... OR ... the Reward object can be replaced with a constant float
        # self.add_transition( s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=0.0)
        # self.define_env_states_actions()
        # -------------------
        # layout object is usually created here in child objects.
        #
        #self.layout = GenericLayout( self, s_hash_rowL=None )

        self.start_state_hash = (0, 0)  # place holder

    def limited_start_state_list(self):
        """
        Return a limited list of starting states.
        Normally used by agents that need to discover the various states
        in an environment, like epsilon-greedy.

        OVERRIDE THIS to return a list of states smaller than ALL ACTION STATES.
        """
        if self.defined_limited_start_state_list is None:
            return self.get_all_action_state_hashes()
        else:
            return self.defined_limited_start_state_list[:]  # return a copy

    def define_limited_start_state_list(self, state_list):
        # can define a start state list smaller than all action states
        # (call define_limited_start_state_list( state_list ))
        self.defined_limited_start_state_list = state_list

    def get_start_state_hash(self):
        """Assume that the value of start_state_hash has been set... so now just return it."""
        if self.start_state_hash is None:
            # use a random state if none provided.
            return self.SC.get_random_state()  # NOTE: this "fall-back" might return a terminal state.
        return self.start_state_hash

    def get_set_of_all_terminal_state_hashes(self):
        """
        Return a set of terminal state hash values, OR an empty set.
        (No non-terminal states should be included.)
        Primarily used to detect the end of an episode.
        """
        # just to make sure it's "fresh", update terminal_set
        self.terminal_set, self.action_state_set = self.TC.get_terminal_set_and_action_set()
        return self.terminal_set

    def get_all_action_state_hashes(self):
        """
        Return a list of action state hash values, OR an empty list.
        (No terminal states should be included.)
        """
        return [s_hash for s_hash in self.SC.iter_state_hash()
                if s_hash not in self.terminal_set]

    def get_any_action_state_hash(self):
        """Return an action state hash at random."""
        return random.choice(tuple(self.action_state_set))

    def get_action_snext_reward(self, s_hash, a_desc):
        """Get (next state hash, float reward) by taking action "a_desc" in state "s_hash"."""
        sn_hash = self.TC.get_prob_weighted_next_state_hash(s_hash, a_desc)
        reward = self.TC.get_reward_value(s_hash, a_desc, sn_hash)
        return sn_hash, reward  # (next state hash, float reward)

    def get_state_legal_action_list(self, s_hash):
        """
        Return a list of possible actions from this state.
        Include any actions thought to be zero probability.
        (OR an empty list, if there are no actions.)
        """
        return self.SAC.get_list_of_all_action_desc(s_hash, incl_zero_prob=True)

    def get_default_policy_desc_dict(self):
        """
        If the environment has a default policy, return it as a dictionary:
        index=state_hash, value=action_desc

        NOTE: for a deterministic policy, the probability of each action is 1.0,
        so there is no need to return tuples of (action, probability).
        """
        # Policy Dictionary
        if self.default_policyD is None:
            return {}
        else:
            return self.default_policyD

    def get_num_states(self):
        return len(self.action_state_set) + len(self.terminal_set)

    def get_num_action_states(self):
        return len(self.action_state_set)

    def get_num_terminal_states(self):
        return len(self.terminal_set)

    def iter_all_action_states(self, randomize=False):
        """iterate over all action states in environment"""
        if randomize:
            for s_hash in random.sample(self.action_state_set, len(self.action_state_set)):
                yield s_hash  # assume none in terminal_set
        else:
            for s_hash in self.action_state_set:
                yield s_hash  # assume none in terminal_set

    def iter_all_terminal_states(self):
        """iterate over all terminal states in environment"""
        for s_hash in self.terminal_set:
            yield s_hash  # assume none in action_state_set

    def is_legal_state(self, s_hash):
        return s_hash in self.SC.stateD

    def is_terminal_state(self, s_hash):
        return s_hash in self.terminal_set

    def iter_all_states(self):
        """iterate over all states in environment"""
        for s_hash in self.iter_all_action_states():
            yield s_hash  # assume none in terminal_set
        for s_hash in self.iter_all_terminal_states():
            yield s_hash  # assume none in action_state_set

    def iter_state_hash_action_desc(self):
        """Iterate over all the (s,a) pairs in the environment"""
        for s_hash, a_desc in self.TC.transitionsD.keys():
            yield s_hash, a_desc

    def iter_action_desc_prob(self, s_hash, incl_zero_prob=False):
        """
        Iterate over all (action_desc, prob) pairs.
        If incl_zero_prob==True, include actions with zero probability.
        """
        for (a_desc, a_prob) in self.SAC.iter_action_desc_prob(s_hash,
                                                               incl_zero_prob=incl_zero_prob):
            yield a_desc, a_prob

    def iter_next_state_prob_reward(self, s_hash, a_desc, incl_zero_prob=False):
        """
        Iterate over all (next_state_hash, prob, reward) tuples.
        If incl_zero_prob==True, include actions with zero probability.
        """
        T = self.TC.get_transition_obj(s_hash, a_desc)
        for sn_hash, t_prob, reward in T.iter_sn_hash_prob_reward():
            yield sn_hash, t_prob, reward

    def get_layout_row_col_of_state(self, s_hash):  # can be s_hash OR State object
        """
        --> OVERRIDE THIS FOR ANY SPECIALTY LAYOUTS <--
        Normally it's best to simply input "s_hash_rowL" to define the layout.

        Return an (i,j) tuple describing the location of s_hash in the layout.
        The upper left corner is (0,0) such that:
        i is the index to the row in "s_hash_rowL".
        """
        # in case a State object is input instead of s_hash, simply fix it
        if isinstance(s_hash, State):
            s_hash = s_hash.hash

        # some grid layouts can use this default (i,j)
        try:
            (i, j) = s_hash
            i = int(i)
            j = int(j)
            return i, j  # (row, col)
        except:
            index = self.SC.get_state_co_index(s_hash)
            #print('for s_hash=',s_hash,' index=',index)
            if index is None:
                return None, None  # looks bad for (row, col)

            n_states = len(self.SC)
            if n_states <= 16:
                return divmod(index, 4)  # (row, col)
            else:
                len_row = 1 + int(math.sqrt(n_states))
                return divmod(index, len_row)  # (row, col)

    def get_estimated_rewards(self):
        """
        Return a dictionary of estimated rewards for each state,
        AND a dictionary of any special messages.
        (Will be exact for a deterministic environment.)
        """
        est_rD = {}  # index=s_hash, value=float reward estimate.
        msgD = {}    # index=s_hash, value=any special message

        # initialize all rewards to zero for all states.
        for S in self.SC.iter_states():
            est_rD[S.hash] = RunningAve(S.hash)

        for s_hash, a_desc, T in self.TC.iter_all_transitions():
            for sn_hash, t_prob, reward in T.iter_sn_hash_prob_reward():
                Robj = T.get_reward_obj(sn_hash)
                if Robj.reward_type == CONST:
                    est_rD[sn_hash].add_val(reward)
                else:
                    msgD[sn_hash] = 'est'
                    # if the reward is stochastic, average 100 values
                    for i in range(100):
                        est_rD[sn_hash].add_val(Robj())

        # Need to convert RunningAve objects to float
        for (s_hash, RA) in est_rD.items():
            est_rD[s_hash] = RA.get_ave()
            #print(s_hash, RA)

        return est_rD, msgD

    def summ_print(self, long=True):
        print('___ "%s" Environment Summary ___' % self.name)
        #term_set, action_set = self.TC.get_terminal_set_and_action_set()
        #print('Passing term_set ',term_set,' to StateColl summ_print.', type(term_set))
        self.SC.summ_print(terminal_set=self.terminal_set)

        self.AC.summ_print()
        if long:
            self.TC.summ_print()

        if self.layout is not None:
            self.layout_print(vname='reward', fmt='', show_env_states=True, none_str='*')

    def layout_print(self, vname='reward', fmt='', show_env_states=True, none_str='*'):
        """Print the value "vname" formatted by the environment layout (if present)."""
        if self.layout is None:
            print('...ERROR... "%s" tried to layout_print w/o a defined layout' % self.name)
            return

        if show_env_states:
            self.layout.s_hash_print(none_str=none_str)

        msgD = {}  # initialize special message dictionary to empty
        if vname == 'reward':
            valD, msgD = self.get_estimated_rewards()  # index=s_hash, value=float reward estimate.
        else:
            valD = {}  # empty if not a recognized vname

        x_axis_label = self.layout.x_axis_label
        y_axis_label = self.layout.y_axis_label
        row_tickL = self.layout.row_tickL
        col_tickL = self.layout.col_tickL

        rows_outL = []
        for row in self.layout.s_hash_rowL:
            outL = []
            for s_hash in row:
                if s_hash not in self.SC.stateD:
                    if is_literal_str(s_hash):
                        outL.append(s_hash[1:-1])
                    else:
                        outL.append(none_str)
                else:
                    val = valD.get(s_hash, None)
                    if val is None:
                        outL.append(none_str)
                    else:
                        if fmt:
                            outL.append(fmt % val)
                        else:
                            outL.append(str(val))

                    if msgD.get(s_hash, ''):
                        outL[-1] = outL[-1] + msgD.get(s_hash, '')

            rows_outL.append(outL)

        if rows_outL:
            print_string_rows(rows_outL, const_col_w=True, line_chr='_', left_pad=' ',
                              y_axis_label=y_axis_label, row_tickL=row_tickL,
                              col_tickL=col_tickL,
                              header=self.name + ' %s Summary' % vname.title(),
                              x_axis_label=x_axis_label, justify='right')
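# A minimal sketch of the usual subclassing pattern described in
# define_environment() above: override it, add actions and transitions, then
# call define_env_states_actions().  The tiny two-state world below is purely
# illustrative; its states, action, and reward are not part of the library.
class _TwoStateDemoEnv(EnvBaseline):

    def define_environment(self):
        # one action state ('Start') with a single action leading to a terminal state ('Done')
        self.add_action('Start', 'go', a_prob=1.0)
        self.add_transition('Start', 'go', 'Done', t_prob=1.0, reward_obj=1.0)
        self.define_env_states_actions()
        self.start_state_hash = 'Start'


if __name__ == "__main__":  # pragma: no cover
    demo = _TwoStateDemoEnv(name='Two State Demo')
    demo.summ_print(long=False)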
def get_gridworld(step_reward=0.0,
                  N_mult=1,  # N_mult must be an integer.
                  width=9, height=6, goal=(0, 8), start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):
    gridworld = EnvBaseline(name='Sutton Ex8.4 Priority Sweep Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    width_big = width * N_mult
    height_big = height * N_mult

    gridworld.characteristic_dim = width_big + height_big * 2

    # get relaxed optimal length from Zhang.
    gridworld.optimal_path_len = int(14 * N_mult * 1.2) + 1

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        di = 0
        dj = 0
        reward = 0

        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width_big:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height_big:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wall_set:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        if state_next_hash in goal_set:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    def make_big_set(pos):
        """Take an (i,j) position, pos, and expand it to the new, big size in x and y."""
        pos_set = set()
        ip, jp = pos
        ip *= N_mult
        jp *= N_mult
        for ixn in range(N_mult):
            for jxn in range(N_mult):
                pos_set.add((ip + ixn, jp + jxn))
        return pos_set

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    # redefine start
    istart, jstart = start
    start = (istart * N_mult, jstart * N_mult)

    # make goal set
    goal_set = make_big_set(goal)

    # make wall set
    wall_set = set()
    for wall in wallL:
        wall_set.update(make_big_set(wall))

    # create state hash entries
    for i in range(height_big):
        for j in range(width_big):
            s_hash = (i, j)
            if (s_hash not in wall_set) and (s_hash not in goal_set):
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')

                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)

                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height_big):  # (0,0) is at the upper left
        rowL = []
        for j in range(width_big):
            s = (i, j)
            if s in wall_set:
                rowL.append('"Wall"')
            else:
                rowL.append(s)
        # append keeps (0,0) at the upper left (insert(0, rowL) would put it at lower left)
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {}
    named_s_hashD[start] = 'Start'
    for g in goal_set:
        named_s_hashD[g] = 'Goal'

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
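# A minimal usage sketch (assumed demo, not part of the original module):
# build a 2x-scaled maze and report its size and relaxed optimal path length.
if __name__ == "__main__":  # pragma: no cover
    big_maze = get_gridworld(N_mult=2)
    print('number of states:', big_maze.get_num_states())
    print('relaxed optimal path length:', big_maze.optimal_path_len)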
def get_env():
    env = EnvBaseline(name="Jacks Car Rental (const rtn)")  # GenericLayout set below

    simplified_str = ("Shangtong Zhang's simplified model such that the # of cars returned "
                      "in daytime becomes constant rather than a random value from a poisson "
                      "distribution, which will reduce calculation time and leave the optimal "
                      "policy/value state matrix almost the same")

    env.set_info('Example 4.2 from Sutton & Barto 2nd Edition page 81.\n' + simplified_str)

    # define all possible actions.
    saL = []  # a list of (s1, s2, adesc)
    s_hash_rowL = []  # layout rows for making 2D output

    for s1 in range(MAX_CARS + 1):  # 20 cars max
        rowL = []  # row of s_hash_rowL
        for s2 in range(MAX_CARS + 1):  # 20 cars max
            s_hash = (s1, s2)
            rowL.append(s_hash)

            for a_desc in range(-5, 6):  # -5 moves 5 cars from 2nd to 1st. +5 from 1st to 2nd.
                if a_desc < 0:
                    # can only move cars if they are present
                    if abs(a_desc) <= s2:
                        env.add_action(s_hash, a_desc, a_prob=1.0)
                        saL.append((s1, s2, a_desc))
                else:
                    if a_desc <= s1:  # can only move cars if they are present
                        env.add_action(s_hash, a_desc, a_prob=1.0)
                        saL.append((s1, s2, a_desc))

        # use insert to put (0,0) at lower left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output

    # ------------------------------
    # figure out transition probabilities and rewards
    for s1 in range(MAX_CARS + 1):
        for s2 in range(MAX_CARS + 1):
            for a_desc in range(-5, 6):
                get_prob_reward(s1, s2, a_desc)

    # ------------------------------
    print('\nStarting to define car rental transitions')

    # with all the probability figured out, define all transitions
    for (s1, s2, a_desc, sn_hash), t_prob in total_probD.items():
        txr = sum_prob_x_rewardD[(s1, s2, a_desc, sn_hash)]
        rval = txr / t_prob
        env.add_transition((s1, s2), a_desc, sn_hash, t_prob=t_prob, reward_obj=rval)
        #if s1==10 and s2==10:
        #    print('for (10,10) a_desc=',a_desc,' sn_hash=',sn_hash,' t_prob=',t_prob,' rval=',rval)

    print('Calling: env.define_env_states_actions')
    env.define_env_states_actions()  # send all states and actions to environment
    print('Environment Ready.')

    # If there is a start state, define it here.
    env.start_state_hash = (10, 10)

    # define default policy (if any)
    env.default_policyD = {}

    # --------------------
    # define layout for output
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL,
                               x_axis_label='#Cars at Second Location',
                               y_axis_label='#Cars at First Location')

    return env
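# A minimal usage sketch (assumed demo, not part of the original module).
# Note that get_env() builds the full transition table, which can take a
# noticeable amount of time for the 21x21 state grid.
if __name__ == "__main__":  # pragma: no cover
    rental_env = get_env()
    print('number of states:', rental_env.get_num_states())
    print('actions from (10, 10):', rental_env.get_state_legal_action_list((10, 10)))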
def get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0):
    Nstates = 2 * Nside_states + 1
    s = '(L%i, R%i)' % (Nside_states, Nside_states)

    env = EnvBaseline(name='%i State Random Walk MRP' % Nstates + s)  # GenericLayout set below
    env.set_info('%i State Random Walk MRP' % Nstates + s)

    RstateL = ['R+%i' % i for i in range(1, Nside_states + 1)]
    LstateL = list(reversed([s.replace('R+', 'L-') for s in RstateL]))

    actionD = {}
    for s in LstateL:
        actionD[s] = ('L', 'R')
    actionD['C'] = ('L', 'R')
    for s in RstateL:
        actionD[s] = ('L', 'R')

    rewardD = {'Win': win_reward, 'Lose': lose_reward}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash',s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, step_reward)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose'] + LstateL + ['C'] + RstateL + ['Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # -------------------- make layout for printing ------------------
    s_hash_rowL = [mrpL]
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['C'] = ('L', 'R')
    for s in LstateL:
        policyD[s] = ('L', 'R')
    for s in RstateL:
        policyD[s] = ('L', 'R')
    env.default_policyD = policyD

    return env
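# A minimal usage sketch (assumed demo, not part of the original module):
# run a single random episode on the 19-state walk, counting steps until
# the 'Win' or 'Lose' terminal state is reached.
if __name__ == "__main__":  # pragma: no cover
    import random

    walk19 = get_random_walk(Nside_states=9)
    s = walk19.start_state_hash
    n = 0
    while not walk19.is_terminal_state(s):
        s, r = walk19.get_action_snext_reward(s, random.choice(('L', 'R')))
        n += 1
    print('ended in', s, 'after', n, 'steps')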