def __init__(self, environment, n_state_vars=2, lo_valL=None, hi_valL=None, num_regionsL=None):
    self.environment = environment
    self.n_state_vars = n_state_vars

    if lo_valL is None:
        lo_valL = [0.0] * n_state_vars
    if hi_valL is None:
        hi_valL = [1.0] * n_state_vars
    if num_regionsL is None:
        num_regionsL = [5] * n_state_vars

    self.lo_valL = lo_valL
    self.hi_valL = hi_valL
    self.num_regionsL = num_regionsL

    self.tile = Tile(lo_valL=lo_valL, hi_valL=hi_valL, num_regionsL=num_regionsL)

    self.chgTracker = ChangeTracker()
    self.init_tracking()

    # initialize a weights numpy array with random values.
    self.init_w_vector()  # e.g. self.w_vector = np.random.randn(self.N) / np.sqrt(self.N)
    self.N = len(self.w_vector)
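# --------------------------------------------------------------------------
# Illustrative sketch (not from the original source): how a simple single-
# tiling discretization of the kind the Tile object above is assumed to
# perform could map a continuous state vector onto integer region indices.
# The real Tile class may differ; this is only a standalone reference.
def region_indices_sketch(state_vals, lo_valL, hi_valL, num_regionsL):
    """Return one region index per state variable, clipped to the value range."""
    indexL = []
    for x, lo, hi, n in zip(state_vals, lo_valL, hi_valL, num_regionsL):
        frac = (x - lo) / (hi - lo)            # position within [lo, hi]
        i = int(frac * n)                      # region number 0..n-1
        indexL.append(min(max(i, 0), n - 1))   # clip out-of-range values
    return indexL
# e.g. region_indices_sketch([0.25, 0.9], [0.0, 0.0], [1.0, 1.0], [5, 5]) -> [1, 4]
# --------------------------------------------------------------------------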
class StateValueColl(object):

    def __init__(self, environment, init_val=0.0):
        """
        A Collection of State-Value, V(s) for each state in the environment.
        Each update to V(s) is done with a learning rate (alpha).

        To get value use:
            sv.get_Vs( s_hash )
        To update value use:
            sv.delta_update( s_hash, delta)
            sv.mc_update( s_hash, alpha, G)
            sv.td0_update( s_hash, alpha, gamma, sn_hash, reward)
            etc.

        (Terminal States have V(s) = 0.0)

        Transition probabilities from (s,a) to (sn,reward) are collected as updates happen.
        """
        self.environment = environment

        # check to see if the environment already contains transition probabilities
        if hasattr(environment, 'iter_next_state_prob_reward'):
            self.env_has_transition_prob = True
        else:
            self.env_has_transition_prob = False

        self.VsD = {}  # index=state_hash, value=state value, Vs (a float)

        self.define_statesD = {}  # index=s_hash: value=ModelStateData object for s_hash
        #                           (used in error estimate)
        # Monte Carlo = Gt, discounted return
        # TD(0)       = Rt+1 + gamma*V(st+1)  (estimated discounted return)
        self.last_delta_VsD = {}  # index=s_hash value=last change to s_hash

        self.chgTracker = ChangeTracker()

        self.init_Vs_to_val(init_val)
        self.init_val = init_val

        self.min_target = None  # initialize when 1st target is submitted
        self.max_target = None

    def num_Vs(self):
        return len(self.VsD)

    def init_Vs_to_val(self, init_val):
        # initialize to init_val for all states, terminal = 0.0
        for s_hash in self.environment.iter_all_states():
            self.last_delta_VsD[s_hash] = 0.0  # record last change as 0.0
            if s_hash in self.environment.terminal_set:
                self.VsD[s_hash] = 0.0
            else:
                self.VsD[s_hash] = init_val

    def get_best_env_action(self, s_hash, a_descL):
        """Given env_has_transition_prob == True, find best action from given list."""
        VsD = {}  # will hold: index=a_desc, value=V(s) for all transitions of a from s

        # iterate over all actions from s, MUST include zero prob actions
        for a_desc in a_descL:
            calcd_v = 0.0
            # iterate over the probability of going to next state, sn when action, a is taken
            for sn_hash, t_prob, reward in \
                    self.environment.iter_next_state_prob_reward(s_hash, a_desc, incl_zero_prob=False):
                # use probability-averaged V(sn) values from state_value_coll
                calcd_v += t_prob * (reward + self.VsD[sn_hash])
            VsD[a_desc] = calcd_v

        best_a, best_a_val = argmax_vmax_dict(VsD)
        return best_a, best_a_val

    def get_best_blackbox_action(self, s_hash, a_descL):
        """Given env_has_transition_prob == False, find best action from given list."""
        if s_hash in self.define_statesD:
            # index=s_hash: value=ModelStateData object for s_hash
            VsD = {}  # will hold: index=a_desc, value=V(s) for all transitions of a from s
            PD = self.define_statesD[s_hash]

            for a_desc in a_descL:
                # select any actions not yet taken to start getting transition data
                if a_desc not in PD.action_sn_rD:
                    return a_desc, self.init_val  # <--- jumps the queue to return untried actions.

                # if the action is deterministic (so far), just look up the current V(s)
                if PD.is_deterministic_action(a_desc):
                    snD = PD.action_sn_rD[a_desc]
                    sn_hash = tuple(snD.keys())[0]
                    rwd_ave_obj = snD[sn_hash]
                    VsD[a_desc] = rwd_ave_obj.get_ave() + self.VsD[sn_hash]
                else:
                    # for stochastic actions, do a transition probability weighted calc of V(s)
                    calcd_v = 0.0
                    a_count = PD.action_countD.get(a_desc, 0)  # index=a_desc: value=count of (s,a) occurrences
                    if a_count > 0:
                        if a_desc in PD.action_sn_rD:
                            snD = PD.action_sn_rD[a_desc]  # snD... index=sn_hash: value=rwd_ave_obj
                            for sn_hash, rwd_ave_obj in snD.items():
                                # fraction of times using a_desc in s_hash resulted in sn_hash
                                t_prob = float(rwd_ave_obj.num_val) / float(a_count)
                                calcd_v += t_prob * (rwd_ave_obj.get_ave() + self.VsD[sn_hash])
                    VsD[a_desc] = calcd_v

            best_a, best_a_val = argmax_vmax_dict(VsD)
            return best_a, best_a_val
        else:
            # this state has not yet been called so initialize transition tracking for it.
            for a_desc in a_descL:
                self.add_action(s_hash, a_desc)
            return a_desc, self.init_val

    def get_best_eps_greedy_action(self, s_hash, epsgreedy_obj=None):
        """
        Pick the best action for state "s_hash" based on max V(s')
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list(s_hash)
        if a_descL:
            if self.env_has_transition_prob:
                best_a_desc, best_a_val = self.get_best_env_action(s_hash, a_descL)
            else:
                best_a_desc, best_a_val = self.get_best_blackbox_action(s_hash, a_descL)

            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj(best_a_desc, a_descL)

            return best_a_desc
        return None

    def record_changes(self, s_hash, delta):
        """Keep track of changes made to V(s) values"""
        delta = abs(delta)  # make sure that only absolute values are saved.

        # remove any record of last change to [s_hash]
        self.chgTracker.dec_change(self.last_delta_VsD[s_hash])
        # add delta to tracking record
        self.chgTracker.inc_change(delta)
        # remember that delta was last change to [s_hash]
        self.last_delta_VsD[s_hash] = delta

    def get_snapshot(self):
        """
        return a deep copy of the value dictionary.
        index=state_hash, value=state value, Vs (a float)
        """
        return copy.deepcopy(self.VsD)

    def delta_update(self, s_hash='', delta=0.0):
        """Add delta to current value of s_hash"""
        self.VsD[s_hash] += delta
        self.record_changes(s_hash, delta)

    def add_action(self, s_hash, a_desc):
        """
        Add an action to transition data with a call as follows.
        self.add_action( s_hash, a_desc )
        """
        if s_hash not in self.define_statesD:
            self.define_statesD[s_hash] = ModelStateData(s_hash)
        self.define_statesD[s_hash].add_action(a_desc)

    def save_action_results(self, s_hash, a_desc, sn_hash, reward_val):
        """Add sn_hash to possible next states and add to its RunningAve"""
        self.add_action(s_hash, a_desc)
        self.define_statesD[s_hash].save_action_results(a_desc, sn_hash, reward_val)

    def mc_update(self, s_hash='', alpha=0.1, G=0.0):
        """
        Do a Monte-Carlo-style learning rate update.
        V(st) = V(st) + alpha * [Gt - V(st)]
        """
        delta = alpha * (G - self.VsD[s_hash])  # allow key error
        self.VsD[s_hash] += delta
        self.record_changes(s_hash, delta)
        return abs(delta)  # return the absolute value of change

    def td0_update(self, s_hash='', a_desc='', alpha=0.1, gamma=1.0, sn_hash='', reward=0.0):
        """
        Do a TD(0), Temporal-Difference-style learning rate update.
        V(st) = V(st) + alpha * [R + gamma*V(st+1) - V(st)]

        Note: the a_desc input is provided in order to collect transition probability data.
        """
        Vstp1 = self.VsD[sn_hash]
        target_val = reward + gamma * Vstp1

        delta = alpha * (target_val - self.VsD[s_hash])  # allow key error
        self.VsD[s_hash] += delta
        self.record_changes(s_hash, delta)

        self.save_action_results(s_hash, a_desc, sn_hash, reward)
        return abs(delta)  # return the absolute value of change

    def get_Vs(self, s_hash):
        """Return the current State-Value for s_hash"""
        return self.VsD[s_hash]  # Allow key error

    def set_Vs(self, s_hash, Vs):
        """Set the current State-Value for s_hash"""
        self.VsD[s_hash] = Vs

    def calc_rms_error(self, true_valueD):
        """Using the dictionary, true_valueD, as reference, calc RMS error."""
        diff_sqL = []
        for s_hash, true_val in true_valueD.items():
            diff_sqL.append((true_val - self.VsD[s_hash])**2)
        rms = sqrt(sum(diff_sqL) / len(diff_sqL))
        return rms

    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the state values."""
        return self.chgTracker.get_biggest_change()

    def make_pickle_filename(self, fname):
        """Make a file name ending with .vs2_pickle"""
        if fname is None:
            fname = self.name.replace(' ', '_') + '.vs2_pickle'
        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.vs2_pickle'
        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename(fname)

        saveD = {}
        saveD['VsD'] = self.VsD
        saveD['define_statesD'] = self.define_statesD

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject, protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved StateValueColl to file:', fname)

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""
        fname = self.make_pickle_filename(fname)
        if not os.path.isfile(fname):
            print('Pickle File NOT found:', fname)
            return None, None

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)
        VsD = readD['VsD']
        define_statesD = readD['define_statesD']
        fileObject.close()
        print('Read StateValueColl from file:', fname)

        return VsD, define_statesD

    def init_from_pickle_file(self, fname=None):  # pragma: no cover
        """Initialize StateValueColl from pickle file."""
        VsD, define_statesD = self.read_pickle_file(fname=fname)
        if VsD:
            self.VsD = VsD
            self.define_statesD = define_statesD
            self.chgTracker.clear()

    def get_policy(self):
        policy = Policy(environment=self.environment)
        for s_hash in self.environment.iter_all_action_states():
            a_desc = self.get_best_eps_greedy_action(s_hash, epsgreedy_obj=None)
            policy.set_sole_action(s_hash, a_desc)
        return policy

    def summ_print(self, fmt_V='%g', none_str='*', show_states=True,
                   show_last_change=True, show_policy=True):
        print()
        print('___ "%s" Alpha-Based State-Value Summary ___' % self.environment.name)

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print(none_str='*')

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            rows_outL = []
            last_delta_rows_outL = []  # if show_last_change == True

            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state(s_hash):
                        if is_literal_str(s_hash):
                            outL.append(s_hash[1:-1])
                            ld_outL.append(s_hash[1:-1])
                        else:
                            outL.append(none_str)
                            ld_outL.append(none_str)
                    else:
                        outL.append(fmt_V % self.VsD[s_hash])

                        delta = self.last_delta_VsD.get(s_hash, None)
                        if delta is None:
                            ld_outL.append('None')
                        else:
                            ld_outL.append(fmt_V % delta)

                rows_outL.append(outL)
                last_delta_rows_outL.append(ld_outL)

            print_string_rows(rows_outL, row_tickL=row_tickL, const_col_w=True,
                              line_chr='_', left_pad=' ', col_tickL=col_tickL,
                              header=self.environment.name + ' State-Value Summary, V(s)',
                              x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                              justify='right')

            if show_last_change:
                print_string_rows(last_delta_rows_outL, row_tickL=row_tickL, const_col_w=True,
                                  line_chr='_', left_pad=' ', col_tickL=col_tickL,
                                  header=self.environment.name + ' Last Change to V(s) Summary',
                                  x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                                  justify='right')

            if show_policy:
                policy = self.get_policy()
                policy.summ_print(verbosity=0, environment=self.environment)

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6
            lmax_V = 6

            outL = []  # list of tuples = (s_hash, V)
            for s_hash, V in self.VsD.items():
                outL.append((s_hash, V))
                lmax_hash = max(lmax_hash, len(str(s_hash)))
                lmax_V = max(lmax_V, len(fmt_V % V))

            fmt_hash = '%' + '%is' % lmax_hash
            fmt_strV = '%' + '%is' % lmax_V
            outL.sort()  # sort in-place

            for (s_hash, V) in outL:
                V = fmt_V % V
                print(' ', fmt_hash % str(s_hash), fmt_strV % V, end='')
                if show_last_change:
                    print(' Last Delta = %s' % self.last_delta_VsD.get(s_hash, None))
                else:
                    print()
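# --------------------------------------------------------------------------
# Illustrative sketch (not from the original source): one TD(0) evaluation
# episode using StateValueColl.  "policy_fn(s_hash)" and "step_fn(s_hash,
# a_desc)" are caller-supplied assumptions (not part of this module);
# policy_fn returns an a_desc and step_fn returns (sn_hash, reward).
def td0_episode_sketch(env, sv, policy_fn, step_fn, start_state,
                       alpha=0.1, gamma=1.0):
    s_hash = start_state
    while s_hash not in env.terminal_set:
        a_desc = policy_fn(s_hash)
        sn_hash, reward = step_fn(s_hash, a_desc)      # caller-supplied dynamics
        # td0_update moves V(s) toward R + gamma*V(s') and also records the
        # (s,a)->(s',r) transition for later greedy-action estimates.
        sv.td0_update(s_hash=s_hash, a_desc=a_desc, alpha=alpha,
                      gamma=gamma, sn_hash=sn_hash, reward=reward)
        s_hash = sn_hash
    return sv.get_biggest_action_state_err()
# --------------------------------------------------------------------------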
class Baseline_Q_Func(object):
    """
    Create a linear function for an environment that simply one-hot encodes
    all of the state-action pairs.

    OVERRIDE THIS for more interesting linear functions.

    This is only interesting for debugging linear function solution routines.
    (i.e. each term in the one-hot encoding should move to near the actual value function)
    """

    # ======================== OVERRIDE STARTING HERE ==========================
    def init_w_vector(self):
        """Initialize the weights vector and the number of entries, N."""
        # initialize a weights numpy array with random values.
        N = len(self.saD)
        self.w_vector = np.random.randn(N) / np.sqrt(N)
        self.N = len(self.w_vector)

    def get_sa_x_vector(self, s_hash, a_desc):
        """
        Return the x vector that represents the (s,a) pair.
        NOTE: the index into x_vector for (s,a) = self.saD[ (s_hash, a_desc) ]
        """
        x_vector = np.zeros(self.N, dtype=float)  # np.float is deprecated; use the builtin float
        x_vector[self.saD[(s_hash, a_desc)]] = 1.0
        return x_vector
    # ======================== OVERRIDE ENDING HERE ==========================

    def QsaEst(self, s_hash, a_desc):
        """Return the current estimate for Q(s,a) from linear function eval."""
        x_vector = self.get_sa_x_vector(s_hash, a_desc)
        return self.w_vector.dot(x_vector)

    def __init__(self, environment):
        self.environment = environment

        # initialize known (s,a) pairs.
        self.saD = {}
        for s_hash in self.environment.iter_all_states():
            for a_desc in self.environment.get_state_legal_action_list(s_hash):
                # set dict value to index of numpy array
                self.saD[(s_hash, a_desc)] = len(self.saD)

        # aD index=a_desc, value=last change to Q(s,a) value, float
        self.last_delta_QsaD = {}  # index=s_hash value=aD (dict)

        self.chgTracker = ChangeTracker()
        self.init_tracking()

        # initialize a weights numpy array with random values.
        self.init_w_vector()  # e.g. self.w_vector = np.random.randn(self.N) / np.sqrt(self.N)
        self.N = len(self.w_vector)

    def init_tracking(self):
        # initialize last-change tracking for all states, terminal = 0.0
        for s_hash in self.environment.iter_all_states():
            self.last_delta_QsaD[s_hash] = {}

            # may not be any actions in terminal state, so set None action.
            if s_hash in self.environment.terminal_set:
                self.last_delta_QsaD[s_hash][None] = 0.0

            aL = self.environment.get_state_legal_action_list(s_hash)
            for a_desc in aL:
                self.last_delta_QsaD[s_hash][a_desc] = 0.0

    def get_number_of_changes(self):
        return self.chgTracker.get_number_of_changes()

    def num_Qsa(self):
        return len(self.saD)

    def get_best_eps_greedy_action(self, s_hash, epsgreedy_obj=None):
        """
        Pick the best action for state "s_hash" based on max Q(s,a)
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list(s_hash)
        if a_descL:
            best_a_desc, best_a_val = a_descL[0], float('-inf')
            bestL = [best_a_desc]
            for a in a_descL:
                q = self.QsaEst(s_hash, a)
                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
                    bestL = [a]
                elif q == best_a_val:
                    bestL.append(a)
            best_a_desc = random.choice(bestL)

            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj(best_a_desc, a_descL)

            return best_a_desc
        return None

    def get_best_greedy_action(self, s_hash):
        return self.get_best_eps_greedy_action(s_hash)

    def get_max_Qsa(self, s_hash):
        """return the maximum Q(s,a) for state, s_hash."""
        a_best = self.get_best_greedy_action(s_hash)
        if a_best is None:
            return None
        return self.QsaEst(s_hash, a_best)

    def record_changes(self, s_hash, a_desc, delta):
        """Keep track of changes made to Q(s,a) values"""
        delta = abs(delta)  # make sure that only absolute values are saved.

        # remove any record of last change to [s_hash][a_desc]
        self.chgTracker.dec_change(self.last_delta_QsaD[s_hash][a_desc])
        # add delta to tracking record
        self.chgTracker.inc_change(delta)
        # remember that delta was last change to [s_hash][a_desc]
        self.last_delta_QsaD[s_hash][a_desc] = delta

    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the action values."""
        return self.chgTracker.get_biggest_change()

    def get_max_last_delta_overall(self):
        """get biggest entry in self.last_delta_QsaD  # index=s_hash value=aD (dict)"""
        d_max = 0.0
        for aD in self.last_delta_QsaD.values():
            for val in aD.values():
                d_max = max(d_max, abs(val))
        return d_max

    def get_policy(self):
        policy = Policy(environment=self.environment)
        for s_hash in self.environment.iter_all_action_states():
            a_desc = self.get_best_greedy_action(s_hash)
            policy.set_sole_action(s_hash, a_desc)
        return policy

    def get_gradient(self, s_hash, a_desc):
        """
        Return the gradient of the value function with respect to w_vector.
        Since the function is linear in w, the gradient is = x_vector.
        """
        return self.get_sa_x_vector(s_hash, a_desc)

    def sarsa_update(self, s_hash='', a_desc='', alpha=0.1, gamma=1.0,
                     sn_hash='', an_desc='', reward=0.0):
        """
        Do a SARSA, Temporal-Difference-style learning rate update.
        Use estimated Q(s,a) values by evaluating linear function approximation.
        w = w + alpha * [R + gamma*QEst(s',a') - QEst(s,a)] * grad(s,a)
        """
        Qsat = self.QsaEst(s_hash, a_desc)

        if sn_hash in self.environment.terminal_set:
            delta = alpha * (reward - Qsat)
        else:
            Qsatp1 = self.QsaEst(sn_hash, an_desc)
            target_val = reward + gamma * Qsatp1
            delta = alpha * (target_val - Qsat)

        delta_vector = delta * self.get_gradient(s_hash, a_desc)
        self.w_vector += delta_vector

        # remember max amount of change due to [s_hash][a_desc]
        delta = np.max(np.absolute(delta_vector))
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def qlearning_update(self, s_hash='', a_desc='', sn_hash='',
                         alpha=0.1, gamma=1.0, reward=0.0):
        """
        Do a Q-Learning, Temporal-Difference-style learning rate update.
        Use estimated Q(s,a) values by evaluating linear function approximation.
        w = w + alpha * [R + gamma* max(QEst(s',a')) - QEst(s,a)] * grad(s,a)
        """
        Qsat = self.QsaEst(s_hash, a_desc)

        if sn_hash in self.environment.terminal_set:
            delta = alpha * (reward - Qsat)
        else:
            # find best Q(s',a')
            an_descL = self.environment.get_state_legal_action_list(sn_hash)
            if an_descL:
                best_a_desc, best_a_val = an_descL[0], float('-inf')
                for a in an_descL:
                    q = self.QsaEst(sn_hash, a)
                    if q > best_a_val:
                        best_a_desc, best_a_val = a, q
            else:
                best_a_val = 0.0

            # use best Q(s',a') to update Q(s,a)
            target_val = reward + gamma * best_a_val
            delta = alpha * (target_val - Qsat)

        delta_vector = delta * self.get_gradient(s_hash, a_desc)
        self.w_vector += delta_vector

        # remember max amount of change due to [s_hash][a_desc]
        delta = np.max(np.absolute(delta_vector))
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    # ========================== pickle routines ===============================
    def make_pickle_filename(self, fname):
        """Make a file name ending with .qlf_pickle"""
        if fname is None:
            fname = self.name.replace(' ', '_') + '.qlf_pickle'
        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.qlf_pickle'
        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename(fname)

        saveD = {}
        saveD['saD'] = self.saD
        saveD['last_delta_QsaD'] = self.last_delta_QsaD
        saveD['w_vector'] = self.w_vector

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject, protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved Baseline_Q_Func to file:', fname)

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""
        fname = self.make_pickle_filename(fname)
        if not os.path.isfile(fname):
            print('Pickle File NOT found:', fname)
            return None, None, None

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)
        saD = readD['saD']
        last_delta_QsaD = readD['last_delta_QsaD']
        w_vector = readD['w_vector']
        fileObject.close()
        print('Read Baseline_Q_Func from file:', fname)

        return saD, last_delta_QsaD, w_vector

    def init_from_pickle_file(self, fname=None):  # pragma: no cover
        """Initialize Baseline_Q_Func from pickle file."""
        saD, last_delta_QsaD, w_vector = self.read_pickle_file(fname=fname)
        if saD:
            self.saD = saD
            self.w_vector = w_vector
            self.last_delta_QsaD = last_delta_QsaD
            self.N = len(self.w_vector)

            self.chgTracker = ChangeTracker()
            self.init_tracking()
        else:
            print('ERROR... Failed to read file:', fname)

    # ========================== summ_print ===============================
    def summ_print(self, fmt_Q='%.3f', none_str='*', show_states=True,
                   show_last_change=True, show_policy=True):
        print()
        print('___ "%s" Action-Value Summary ___' % self.environment.name)

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print(none_str='*')

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            d_max = self.get_max_last_delta_overall()
            if d_max == 0.0:
                d_max = 1.0E-10

            rows_outL = []
            last_delta_rows_outL = []  # if show_last_change == True

            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state(s_hash):
                        if is_literal_str(s_hash):
                            outL.append(s_hash[1:-1])
                            ld_outL.append(s_hash[1:-1])
                        else:
                            outL.append(none_str)
                            ld_outL.append(none_str)
                    else:
                        # s_hash is a legal state hash
                        aL = self.environment.get_state_legal_action_list(s_hash)
                        sL = [str(s_hash)]
                        ld_sL = [str(s_hash)]
                        for a_desc in aL:
                            qsa = self.QsaEst(s_hash, a_desc)
                            s = fmt_Q % qsa
                            sL.append('%s=' % str(a_desc) + s.strip())

                            try:
                                d_val = int(100.0 * self.last_delta_QsaD[s_hash].get(a_desc) / d_max)
                                if d_val > 0:
                                    lds = '%i%%' % d_val
                                    ld_sL.append('%s=' % str(a_desc) + lds.strip())
                                else:
                                    ld_sL.append('%s~0' % str(a_desc))
                            except:
                                ld_sL.append('%s=None' % str(a_desc))

                        outL.append('\n'.join(sL).strip())
                        ld_outL.append('\n'.join(ld_sL).strip())

                rows_outL.append(outL)
                last_delta_rows_outL.append(ld_outL)

            print_string_rows(rows_outL, row_tickL=row_tickL, const_col_w=True,
                              line_chr='_', left_pad=' ', col_tickL=col_tickL,
                              header=self.environment.name + ' Action-Value Summary, Q(s,a)',
                              x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                              justify='right')

            if show_last_change:
                print_string_rows(last_delta_rows_outL, row_tickL=row_tickL, const_col_w=True,
                                  line_chr='_', left_pad=' ', col_tickL=col_tickL,
                                  header=self.environment.name +
                                         ' Last %% of Max Change to Q(s,a) Summary, (max change=%g)' % d_max,
                                  x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                                  justify='right')

            if show_policy:
                policy = self.get_policy()
                policy.summ_print(verbosity=0, environment=self.environment)

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6

            outL = []  # list of strings "(s_hash,a_desc)=Q"
            for s_hash in self.environment.iter_all_states():
                aL = self.environment.get_state_legal_action_list(s_hash)
                for a_desc in aL:
                    qsa = self.QsaEst(s_hash, a_desc)
                    q = fmt_Q % qsa
                    s = '(%s, %s)=' % (str(s_hash), str(a_desc)) + q.strip()
                    if show_last_change:
                        s = s + ' Last Delta = %s' % self.last_delta_QsaD[s_hash].get(a_desc, None)
                    outL.append(s)
                    lmax_hash = max(lmax_hash, len(s))

            outL.sort()  # sort in-place

            for s in outL:
                print(' ', s)
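# --------------------------------------------------------------------------
# Illustrative sketch (not from the original source): one semi-gradient SARSA
# episode using the one-hot Baseline_Q_Func.  "step_fn(s_hash, a_desc)" and
# "start_state" are caller-supplied assumptions (not part of this module);
# step_fn must return (sn_hash, reward).  With one-hot features this reduces
# to tabular SARSA, which is exactly what makes the baseline useful for debug.
def linear_sarsa_episode_sketch(env, qfunc, step_fn, start_state,
                                alpha=0.1, gamma=1.0, epsgreedy_obj=None):
    s_hash = start_state
    a_desc = qfunc.get_best_eps_greedy_action(s_hash, epsgreedy_obj=epsgreedy_obj)
    while s_hash not in env.terminal_set and a_desc is not None:
        sn_hash, reward = step_fn(s_hash, a_desc)      # caller-supplied dynamics
        an_desc = qfunc.get_best_eps_greedy_action(sn_hash, epsgreedy_obj=epsgreedy_obj)
        # w <- w + alpha*[R + gamma*QEst(s',a') - QEst(s,a)] * grad Q(s,a)
        qfunc.sarsa_update(s_hash=s_hash, a_desc=a_desc, alpha=alpha, gamma=gamma,
                           sn_hash=sn_hash, an_desc=an_desc, reward=reward)
        s_hash, a_desc = sn_hash, an_desc
    return qfunc.get_biggest_action_state_err()
# --------------------------------------------------------------------------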
class Baseline_V_Func(object):
    """
    Create a linear function for an environment that simply one-hot encodes
    all of the states.

    OVERRIDE THIS for more interesting linear functions.

    This is only interesting for debugging linear function solution routines.
    (i.e. each term in the one-hot encoding should move to near the actual value function)
    """

    # ======================== OVERRIDE STARTING HERE ==========================
    def init_w_vector(self):
        """Initialize the weights vector and the number of entries, N."""
        # initialize a weights numpy array with random values.
        N = len(self.sD)
        self.w_vector = np.random.randn(N) / np.sqrt(N)
        self.N = len(self.w_vector)

    def get_x_vector(self, s_hash):
        """
        Return the x vector that represents the state, s_hash.
        NOTE: the index into x_vector for s_hash = self.sD[ s_hash ]
        """
        x_vector = np.zeros(self.N, dtype=float)  # np.float is deprecated; use the builtin float
        x_vector[self.sD[s_hash]] = 1.0
        return x_vector
    # ======================== OVERRIDE ENDING HERE ==========================

    def VsEst(self, s_hash):
        """Return the current estimate for V(s) from linear function eval."""
        x_vector = self.get_x_vector(s_hash)
        return self.w_vector.dot(x_vector)

    def __init__(self, environment):
        self.environment = environment

        self.chgTracker = ChangeTracker()
        self.init_tracking()

        # initialize a weights numpy array with random values.
        self.init_w_vector()  # e.g. self.w_vector = np.random.randn(self.N) / np.sqrt(self.N)
        self.N = len(self.w_vector)

    def init_tracking(self):
        # initialize known states.
        self.sD = {}
        self.last_delta_VsD = {}  # index=s_hash value=last change to s_hash

        # initialize tracking for all states, terminal = 0.0
        for s_hash in self.environment.iter_all_states():
            # set dict value to index of numpy array
            self.sD[s_hash] = len(self.sD)
            self.last_delta_VsD[s_hash] = 0.0

    def get_number_of_changes(self):
        return self.chgTracker.get_number_of_changes()

    def num_Vs(self):
        return len(self.sD)

    def record_changes(self, s_hash, delta):
        """Keep track of changes made to V(s) values"""
        delta = abs(delta)  # make sure that only absolute values are saved.

        # remove any record of last change to [s_hash]
        self.chgTracker.dec_change(self.last_delta_VsD[s_hash])
        # add delta to tracking record
        self.chgTracker.inc_change(delta)
        # remember that delta was last change to [s_hash]
        self.last_delta_VsD[s_hash] = delta

    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the state values."""
        return self.chgTracker.get_biggest_change()

    def get_max_last_delta_overall(self):
        """get biggest entry in self.last_delta_VsD  # index=s_hash value=last change (float)"""
        d_max = 0.0
        for val in self.last_delta_VsD.values():
            d_max = max(d_max, abs(val))
        return d_max

    def get_gradient(self, s_hash):
        """
        Return the gradient of the value function with respect to w_vector.
        Since the function is linear in w, the gradient is = x_vector.
        """
        return self.get_x_vector(s_hash)

    def mc_update(self, s_hash='', alpha=0.1, G=0.0):
        """
        Do a Monte-Carlo-style learning rate update.
        w = w + alpha * [Gt - Vhat(st)] * grad(st)
        """
        Vs = self.VsEst(s_hash)
        delta = alpha * (G - Vs)

        delta_vector = delta * self.get_gradient(s_hash)
        self.w_vector += delta_vector

        delta = np.max(np.absolute(delta_vector))
        self.record_changes(s_hash, delta)

        return abs(delta)  # return the absolute value of change

    def td0_update(self, s_hash='', alpha=0.1, gamma=1.0, sn_hash='', reward=0.0):
        """
        Do a TD(0), Temporal-Difference-style learning rate update.
        w = w + alpha * [R + gamma*VEst(s',w) - VEst(s,w)] * grad(s)
        """
        Vs = self.VsEst(s_hash)

        if sn_hash in self.environment.terminal_set:
            target_val = reward
        else:
            Vstp1 = self.VsEst(sn_hash)
            target_val = reward + gamma * Vstp1

        delta = alpha * (target_val - Vs)

        delta_vector = delta * self.get_gradient(s_hash)
        self.w_vector += delta_vector

        delta = np.max(np.absolute(delta_vector))
        self.record_changes(s_hash, delta)

        return abs(delta)  # return the absolute value of change

    # ========================== pickle routines ===============================
    def make_pickle_filename(self, fname):
        """Make a file name ending with .vlf_pickle"""
        if fname is None:
            fname = self.name.replace(' ', '_') + '.vlf_pickle'
        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.vlf_pickle'
        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename(fname)

        saveD = {}
        saveD['sD'] = self.sD
        saveD['last_delta_VsD'] = self.last_delta_VsD
        saveD['w_vector'] = self.w_vector

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject, protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved Baseline_V_Func to file:', fname)

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""
        fname = self.make_pickle_filename(fname)
        if not os.path.isfile(fname):
            print('Pickle File NOT found:', fname)
            return None, None, None

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)
        sD = readD['sD']
        last_delta_VsD = readD['last_delta_VsD']
        w_vector = readD['w_vector']
        fileObject.close()
        print('Read Baseline_V_Func from file:', fname)

        return sD, last_delta_VsD, w_vector

    def init_from_pickle_file(self, fname=None):  # pragma: no cover
        """Initialize Baseline_V_Func from pickle file."""
        sD, last_delta_VsD, w_vector = self.read_pickle_file(fname=fname)
        if sD:
            self.sD = sD
            self.w_vector = w_vector
            self.last_delta_VsD = last_delta_VsD
            self.N = len(self.w_vector)

            self.chgTracker = ChangeTracker()
            self.init_tracking()
        else:
            print('ERROR... Failed to read file:', fname)

    # ========================== summ_print ===============================
    def summ_print(self, fmt_V='%g', none_str='*', show_states=True, show_last_change=True):
        print()
        print('___ "%s" Alpha-Based State-Value Summary ___' % self.environment.name)

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print(none_str='*')

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            rows_outL = []
            last_delta_rows_outL = []  # if show_last_change == True

            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state(s_hash):
                        if is_literal_str(s_hash):
                            outL.append(s_hash[1:-1])
                            ld_outL.append(s_hash[1:-1])
                        else:
                            outL.append(none_str)
                            ld_outL.append(none_str)
                    else:
                        outL.append(fmt_V % self.VsEst(s_hash))

                        delta = self.last_delta_VsD.get(s_hash, None)
                        if delta is None:
                            ld_outL.append('None')
                        else:
                            ld_outL.append(fmt_V % delta)

                rows_outL.append(outL)
                last_delta_rows_outL.append(ld_outL)

            print_string_rows(rows_outL, row_tickL=row_tickL, const_col_w=True,
                              line_chr='_', left_pad=' ', col_tickL=col_tickL,
                              header=self.environment.name + ' State-Value Summary, V(s)',
                              x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                              justify='right')

            if show_last_change:
                print_string_rows(last_delta_rows_outL, row_tickL=row_tickL, const_col_w=True,
                                  line_chr='_', left_pad=' ', col_tickL=col_tickL,
                                  header=self.environment.name + ' Last Change to V(s) Summary',
                                  x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                                  justify='right')

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6
            lmax_V = 6

            outL = []  # list of tuples = (s_hash, V)
            for s_hash in self.sD:
                V = self.VsEst(s_hash)
                outL.append((s_hash, V))
                lmax_hash = max(lmax_hash, len(str(s_hash)))
                lmax_V = max(lmax_V, len(fmt_V % V))

            fmt_hash = '%' + '%is' % lmax_hash
            fmt_strV = '%' + '%is' % lmax_V
            outL.sort()  # sort in-place

            for (s_hash, V) in outL:
                V = fmt_V % V
                print(' ', fmt_hash % str(s_hash), fmt_strV % V, end='')
                if show_last_change:
                    print(' Last Delta = %s' % self.last_delta_VsD.get(s_hash, None))
                else:
                    print()
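# --------------------------------------------------------------------------
# Illustrative sketch (not from the original source): one semi-gradient TD(0)
# evaluation episode using the one-hot Baseline_V_Func.  "policy_fn(s_hash)"
# and "step_fn(s_hash, a_desc)" are caller-supplied assumptions (not part of
# this module); policy_fn returns an a_desc and step_fn returns (sn_hash, reward).
def linear_td0_episode_sketch(env, vfunc, policy_fn, step_fn, start_state,
                              alpha=0.1, gamma=1.0):
    s_hash = start_state
    while s_hash not in env.terminal_set:
        a_desc = policy_fn(s_hash)
        sn_hash, reward = step_fn(s_hash, a_desc)      # caller-supplied dynamics
        # w <- w + alpha*[R + gamma*VEst(s') - VEst(s)] * grad V(s)
        vfunc.td0_update(s_hash=s_hash, alpha=alpha, gamma=gamma,
                         sn_hash=sn_hash, reward=reward)
        s_hash = sn_hash
    return vfunc.get_biggest_action_state_err()
# --------------------------------------------------------------------------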
class ActionValueColl(object):

    def __init__(self, environment, init_val=0.0):
        """
        A Collection of Action-Value, Q(s,a) floating point values for each
        state-action pair in the environment.
        Each value can be updated with a learning rate (alpha).

        To get value use:
            qsa.get_val( s_hash, a_desc )
        To update value use:
            qsa.delta_update( s_hash, a_desc, delta)
            qsa.sarsa_update( s_hash, a_desc, alpha, gamma, sn_hash, an_desc, reward)

        (Terminal States have Q(s,a) = 0.0)
        """
        self.environment = environment

        self.QsaD = {}  # index=s_hash value=aD (dict), aD index=a_desc, value=Q(s,a) value, float

        # aD index=a_desc, value=last change to Q(s,a) value, float
        self.last_delta_QsaD = {}  # index=s_hash value=aD (dict)

        self.chgTracker = ChangeTracker()

        self.init_Qsa_to_val(init_val)
        self.init_val = init_val

    def get_number_of_changes(self):
        return self.chgTracker.get_number_of_changes()

    def merge_active_value_coll(self, av_coll_2):
        """Merge self and av_coll_2 into a single ActionValueColl object"""
        av_result = copy.deepcopy(self)

        for s_hash, aD in self.QsaD.items():
            for a_desc, Q in aD.items():
                av_result.QsaD[s_hash][a_desc] = (self.QsaD[s_hash][a_desc] +
                                                  av_coll_2.QsaD[s_hash][a_desc]) / 2.0
        return av_result

    def build_sv_from_av(self):
        """
        Build a StateValueColl from this ActionValueColl.
        NOTE: Any policy derived directly from the resulting StateValueColl
              will LIKELY BE DIFFERENT from a policy derived directly from
              this ActionValueColl.
        """
        sv = StateValueColl(self.environment)

        for s_hash, aD in self.QsaD.items():
            best_val = float('-inf')
            for a_desc, Q in aD.items():
                if self.QsaD[s_hash][a_desc] > best_val:
                    best_val = self.QsaD[s_hash][a_desc]
            sv.VsD[s_hash] = best_val
        return sv

    def num_Qsa(self):
        return len(self.QsaD)

    def init_Qsa_to_val(self, init_val):
        # initialize to init_val for all states, terminal = 0.0
        for s_hash in self.environment.iter_all_states():
            if s_hash not in self.QsaD:
                self.QsaD[s_hash] = {}
                self.last_delta_QsaD[s_hash] = {}

            # may not be any actions in terminal state, so set None action.
            if s_hash in self.environment.terminal_set:
                self.QsaD[s_hash][None] = 0.0
                self.last_delta_QsaD[s_hash][None] = 0.0

            aL = self.environment.get_state_legal_action_list(s_hash)
            for a_desc in aL:
                self.last_delta_QsaD[s_hash][a_desc] = 0.0
                # some terminal states have actions to themselves.
                if s_hash in self.environment.terminal_set:
                    self.QsaD[s_hash][a_desc] = 0.0
                else:
                    self.QsaD[s_hash][a_desc] = init_val

    def get_best_eps_greedy_action(self, s_hash, epsgreedy_obj=None):
        """
        Pick the best action for state "s_hash" based on max Q(s,a)
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list(s_hash)
        if a_descL:
            best_a_desc, best_a_val = a_descL[0], float('-inf')
            bestL = [best_a_desc]
            for a in a_descL:
                q = self.QsaD[s_hash][a]
                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
                    bestL = [a]
                elif q == best_a_val:
                    bestL.append(a)
            best_a_desc = random.choice(bestL)

            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj(best_a_desc, a_descL)

            return best_a_desc
        return None

    def get_best_greedy_action(self, s_hash):
        return self.get_best_eps_greedy_action(s_hash)

    def get_max_Qsa(self, s_hash):
        """return the maximum Q(s,a) for state, s_hash."""
        a_best = self.get_best_greedy_action(s_hash)
        if a_best is None:
            return None
        return self.get_val(s_hash, a_best)

    def get_best_dbl_eps_greedy_action(self, av_coll_2, s_hash, epsgreedy_obj=None):
        """
        Pick the best action for state "s_hash" based on COMBINED max Q(s,a)
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list(s_hash)
        if a_descL:
            best_a_desc, best_a_val = a_descL[0], float('-inf')
            bestL = [best_a_desc]
            for a in a_descL:
                q1 = self.QsaD[s_hash][a]
                q2 = av_coll_2.QsaD[s_hash][a]
                q = q1 + q2
                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
                    bestL = [a]
                elif q == best_a_val:
                    bestL.append(a)
            best_a_desc = random.choice(bestL)

            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj(best_a_desc, a_descL)

            return best_a_desc
        return None

    def record_changes(self, s_hash, a_desc, delta):
        """Keep track of changes made to Q(s,a) values"""
        delta = abs(delta)  # make sure that only absolute values are saved.

        # remove any record of last change to [s_hash][a_desc]
        self.chgTracker.dec_change(self.last_delta_QsaD[s_hash][a_desc])
        # add delta to tracking record
        self.chgTracker.inc_change(delta)
        # remember that delta was last change to [s_hash][a_desc]
        self.last_delta_QsaD[s_hash][a_desc] = delta

    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the action values."""
        return self.chgTracker.get_biggest_change()

    def get_val(self, s_hash, a_desc):
        """Return the Action-Value for (s_hash, a_desc)"""
        return self.QsaD[s_hash][a_desc]  # Allow key error

    def delta_update(self, s_hash='', a_desc='', delta=0.0):
        """Add delta to current value of Q(s,a) for s_hash, a_desc"""
        self.QsaD[s_hash][a_desc] += delta
        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

    def qlearning_update(self, s_hash='', a_desc='', sn_hash='',
                         alpha=0.1, gamma=1.0, reward=0.0):
        """
        Do a Q-Learning, Temporal-Difference-style learning rate update.
        Q(s,a) = Q(s,a) + alpha * [R + gamma* max(Q(s',a')) - Q(s,a)]
        """
        Qsat = self.QsaD[s_hash][a_desc]  # allow key error

        # find best Q(s',a')
        an_descL = self.environment.get_state_legal_action_list(sn_hash)
        if an_descL:
            best_a_desc, best_a_val = an_descL[0], float('-inf')
            for a in an_descL:
                q = self.QsaD[sn_hash][a]
                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
        else:
            best_a_val = 0.0

        # use best Q(s',a') to update Q(s,a)
        target_val = reward + gamma * best_a_val
        delta = alpha * (target_val - Qsat)
        self.QsaD[s_hash][a_desc] += delta

        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def dbl_qlearning_update(self, av_coll_2, s_hash='', a_desc='', sn_hash='',
                             alpha=0.1, gamma=1.0, reward=0.0):
        """
        Do a Double Q-Learning, Temporal-Difference-style learning rate update.
        Given a 2nd ActionValueColl, av_coll_2, update EITHER self, or av_coll_2.
        Q(s,a) = Q(s,a) + alpha * [R + gamma* max(Q(s',a')) - Q(s,a)]
        """
        # randomly decide which Q(s,a) to update, self or av_coll_2
        if random.random() < 0.5:
            # use best Q(s',a') to update "self" Q(s,a)
            Qsat = self.QsaD[s_hash][a_desc]  # allow key error
            best_a_desc = self.get_best_greedy_action(sn_hash)
            q = av_coll_2.QsaD[sn_hash][best_a_desc]

            target_val = reward + gamma * q
            delta = alpha * (target_val - Qsat)
            self.QsaD[s_hash][a_desc] += delta

            # remember amount of change to [s_hash][a_desc]
            self.record_changes(s_hash, a_desc, delta)
        else:
            # use best Q(s',a') to update "av_coll_2" Q(s,a)
            Qsat = av_coll_2.QsaD[s_hash][a_desc]  # allow key error
            best_a_desc = av_coll_2.get_best_greedy_action(sn_hash)
            q = self.QsaD[sn_hash][best_a_desc]

            target_val = reward + gamma * q
            delta = alpha * (target_val - Qsat)
            av_coll_2.QsaD[s_hash][a_desc] += delta

            # remember amount of change to [s_hash][a_desc]
            av_coll_2.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def sarsa_update(self, s_hash='', a_desc='', alpha=0.1, gamma=1.0,
                     sn_hash='', an_desc='', reward=0.0):
        """
        Do a SARSA, Temporal-Difference-style learning rate update.
        Q(s,a) = Q(s,a) + alpha * [R + gamma*Q(s',a') - Q(s,a)]
        """
        Qsat = self.QsaD[s_hash][a_desc]  # allow key error
        Qsatp1 = self.QsaD[sn_hash][an_desc]

        target_val = reward + gamma * Qsatp1
        delta = alpha * (target_val - Qsat)
        self.QsaD[s_hash][a_desc] += delta

        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def expected_sarsa_update(self, s_hash='', a_desc='', alpha=0.1, gamma=1.0,
                              epsilon=0.1, sn_hash='', reward=0.0):
        """
        Do an Expected SARSA, Temporal-Difference-style learning rate update.
        Q(s,a) = Q(s,a) + alpha * [R + gamma * Expected[Q(s',a')] - Q(s,a)]
        """
        an_best = self.get_best_greedy_action(sn_hash)
        expected_val = (1.0 - epsilon) * self.QsaD[sn_hash][an_best]

        an_descL = self.environment.get_state_legal_action_list(sn_hash)
        if an_descL:
            frac = epsilon / len(an_descL)
            for an_desc in an_descL:
                expected_val += frac * self.QsaD[sn_hash][an_desc]

        target_val = reward + gamma * expected_val
        delta = alpha * (target_val - self.QsaD[s_hash][a_desc])
        self.QsaD[s_hash][a_desc] += delta

        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def make_pickle_filename(self, fname):
        """Make a file name ending with .qsa_pickle"""
        if fname is None:
            fname = self.name.replace(' ', '_') + '.qsa_pickle'
        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.qsa_pickle'
        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename(fname)

        saveD = {}
        saveD['QsaD'] = self.QsaD

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject, protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved ActionValueColl to file:', fname)

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""
        fname = self.make_pickle_filename(fname)
        if not os.path.isfile(fname):
            print('Pickle File NOT found:', fname)
            return None

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)
        QsaD = readD['QsaD']
        fileObject.close()
        print('Read ActionValueColl from file:', fname)

        return QsaD

    def init_from_pickle_file(self, fname=None):  # pragma: no cover
        """Initialize ActionValueColl from pickle file."""
        QsaD = self.read_pickle_file(fname=fname)
        if QsaD:
            self.QsaD = QsaD

    def get_max_last_delta_overall(self):
        """get biggest entry in self.last_delta_QsaD  # index=s_hash value=aD (dict)"""
        d_max = 0.0
        for aD in self.last_delta_QsaD.values():
            for val in aD.values():
                d_max = max(d_max, abs(val))
        return d_max

    def get_policy(self):
        policy = Policy(environment=self.environment)
        for s_hash in self.environment.iter_all_action_states():
            a_desc = self.get_best_greedy_action(s_hash)
            policy.set_sole_action(s_hash, a_desc)
        return policy

    def summ_print(self, fmt_Q='%.3f', none_str='*', show_states=True,
                   show_last_change=True, show_policy=True):
        print()
        print('___ "%s" Action-Value Summary ___' % self.environment.name)

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print(none_str='*')

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            d_max = self.get_max_last_delta_overall()
            if d_max == 0.0:
                d_max = 1.0E-10

            rows_outL = []
            last_delta_rows_outL = []  # if show_last_change == True

            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state(s_hash):
                        if is_literal_str(s_hash):
                            outL.append(s_hash[1:-1])
                            ld_outL.append(s_hash[1:-1])
                        else:
                            outL.append(none_str)
                            ld_outL.append(none_str)
                    else:
                        aD = self.QsaD[s_hash]
                        sL = [str(s_hash)]
                        ld_sL = [str(s_hash)]
                        for a_desc, qsa in aD.items():
                            s = fmt_Q % qsa
                            sL.append('%s=' % str(a_desc) + s.strip())

                            try:
                                d_val = int(100.0 * self.last_delta_QsaD[s_hash].get(a_desc) / d_max)
                                if d_val > 0:
                                    lds = '%i%%' % d_val
                                    ld_sL.append('%s=' % str(a_desc) + lds.strip())
                                else:
                                    ld_sL.append('%s~0' % str(a_desc))
                            except:
                                ld_sL.append('%s=None' % str(a_desc))

                        outL.append('\n'.join(sL).strip())
                        ld_outL.append('\n'.join(ld_sL).strip())

                rows_outL.append(outL)
                last_delta_rows_outL.append(ld_outL)

            print_string_rows(rows_outL, row_tickL=row_tickL, const_col_w=True,
                              line_chr='_', left_pad=' ', col_tickL=col_tickL,
                              header=self.environment.name + ' Action-Value Summary, Q(s,a)',
                              x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                              justify='right')

            if show_last_change:
                print_string_rows(last_delta_rows_outL, row_tickL=row_tickL, const_col_w=True,
                                  line_chr='_', left_pad=' ', col_tickL=col_tickL,
                                  header=self.environment.name +
                                         ' Last %% of Max Change to Q(s,a) Summary, (max change=%g)' % d_max,
                                  x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                                  justify='right')

            if show_policy:
                policy = self.get_policy()
                policy.summ_print(verbosity=0, environment=self.environment)

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6

            outL = []  # list of strings "(s_hash,a_desc)=Q"
            for s_hash in self.QsaD.keys():
                for a_desc, qsa in self.QsaD[s_hash].items():
                    q = fmt_Q % self.QsaD[s_hash][a_desc]
                    s = '(%s, %s)=' % (str(s_hash), str(a_desc)) + q.strip()
                    if show_last_change:
                        s = s + ' Last Delta = %s' % self.last_delta_QsaD[s_hash].get(a_desc, None)
                    outL.append(s)
                    lmax_hash = max(lmax_hash, len(s))

            outL.sort()  # sort in-place

            for s in outL:
                print(' ', s)
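# --------------------------------------------------------------------------
# Illustrative sketch (not from the original source): a minimal tabular SARSA
# episode loop built on ActionValueColl.  "step_fn(s_hash, a_desc)" and
# "start_state" are caller-supplied assumptions (not part of this module);
# step_fn must return (sn_hash, reward).  Swapping the sarsa_update call for
# qlearning_update (which needs no an_desc) gives the Q-Learning equivalent.
def sarsa_episode_sketch(env, av, step_fn, start_state,
                         alpha=0.1, gamma=1.0, epsgreedy_obj=None):
    s_hash = start_state
    a_desc = av.get_best_eps_greedy_action(s_hash, epsgreedy_obj=epsgreedy_obj)
    total_reward = 0.0
    while s_hash not in env.terminal_set and a_desc is not None:
        sn_hash, reward = step_fn(s_hash, a_desc)          # caller-supplied dynamics
        an_desc = av.get_best_eps_greedy_action(sn_hash, epsgreedy_obj=epsgreedy_obj)
        # Q(s,a) <- Q(s,a) + alpha*[R + gamma*Q(s',a') - Q(s,a)]
        av.sarsa_update(s_hash=s_hash, a_desc=a_desc, alpha=alpha, gamma=gamma,
                        sn_hash=sn_hash, an_desc=an_desc, reward=reward)
        total_reward += reward
        s_hash, a_desc = sn_hash, an_desc
    return total_reward
# --------------------------------------------------------------------------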