def plot_grid_numbers(rows_outL, header='', x_axis_label='', do_show=True, fmt='%g'):
    """Plot each row of rows_outL as a row of labeled grid cells using matplotlib.

       (header, x_axis_label and fmt are accepted for interface consistency but
        are not used below.)
    """
    if not got_matplotlib:  # module-level flag expected to be set by a try/except around the matplotlib imports
        banner('ERROR: could not import matplotlib\n"plot_grid_numbers" FAILED.')
        return

    Nrows = len(rows_outL)
    Ncols = max([len(row) for row in rows_outL])

    fig, axs = plt.subplots()
    plt.axes()

    font = FontProperties()
    font.set_size('large')
    font.set_family('fantasy')
    font.set_style('normal')

    for i in range(Nrows):
        rowL = rows_outL[i]
        x = Nrows - i - 1  # plot row 0 at the top of the grid
        for j in range(Ncols):
            if j < len(rowL):
                s = rowL[j]
            else:
                s = '*'

            # Rectangle( (x,y), width, height)
            rect = Rectangle((j, x), 0.9, 0.9, fc='r', alpha=0.5, edgecolor='black')
            plt.gca().add_patch(rect)
            t = plt.text(j + .45, x + .45, s, fontproperties=font, **alignment)

    plt.xlim(0, Ncols)
    plt.ylim(0, Nrows)
    if do_show:  # honor the do_show flag instead of always showing
        plt.show()
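# --- usage sketch (assumption) ---------------------------------------------
# A quick, hypothetical call of the function above.  It assumes the module-level
# matplotlib names (plt, FontProperties, Rectangle, alignment) imported
# successfully; the row values are made up for illustration only.
rowsL = [['0.0', '0.5', '1.0'],
         ['-1.0', '*', '2.0']]
plot_grid_numbers(rowsL, header='V(s) grid', do_show=True)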
def read_pickle_file(self, fname=None):  # pragma: no cover
    """Reads data from pickle"""
    #raise ValueError( 'read_pickle_file is BROKEN... DO NOT USE' )
    fname = self.make_pickle_filename(fname)

    if os.path.isfile(fname):
        pass  # all good
    elif os.path.isfile(os.path.join(mdp_path, fname)):
        fname = os.path.join(mdp_path, fname)
    else:
        print('Pickle File NOT found:', fname)
        print('mdp_path:', mdp_path)
        s = '''Try running: "introrl_build_mdp" to create MDP Pickle Files.
Type: introrl_build_mdp at the command line.'''
        banner(s, banner_char='', leftMargin=0, just='center')
        return False

    fileObject = open(fname, 'rb')
    readD = pickle.load(fileObject)

    self.name = readD['name']
    self.define_statesD = readD['define_statesD']
    self.info = readD['info']
    self.layout = readD['layout']

    if 'start_state_hash' in readD:
        self.start_state_hash = readD['start_state_hash']

    if 'defined_limited_start_state_list' in readD:
        self.defined_limited_start_state_list = readD['defined_limited_start_state_list']

    # use define_statesD to initialize data structures
    self.define_env_states_actions()
    # ----------------------
    fileObject.close()
    return True
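# --- payload sketch (assumption) -------------------------------------------
# A minimal sketch of the dictionary layout read_pickle_file() unpacks.  Only
# the keys are taken from the method above; every value here is a placeholder
# assumption, not real MDP data, and no claim is made about the file naming
# convention used by make_pickle_filename().
import pickle

payloadD = {'name': 'Sample_MDP',
            'define_statesD': {},        # state/action definitions used by define_env_states_actions()
            'info': 'demo payload',
            'layout': None,
            'start_state_hash': (0, 0)}  # optional key

blob = pickle.dumps(payloadD)            # serialized form such a file would hold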
def td0_prediction(policy, state_value_coll, all_start_states=False,
                   do_summ_print=True, show_last_change=True,
                   alpha=0.1, const_alpha=True, alpha_half_life=200,
                   max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001,
                   gamma=1.0,
                   result_list='abserr', true_valueD=None,
                   value_snapshot_loopL=None):  # if input, save V(s) snapshot at iteration steps indicated
    """
    ... GIVEN A POLICY TO EVALUATE apply TD(0), Temporal Difference(0) Prediction

    Terminates when abserr < max_abserr

    Assume that V(s), state_value_coll, has been initialized prior to call.
    (Note that the StateValues object has a reference to the Environment object)

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any start state.

    state_value_coll WILL BE CHANGED... policy WILL NOT.
    """
    resultL = []      # based on result_list, can be "rms" or "abserr"
    value_snapD = {}  # index=loop counter, value=dict of {s_hash:Vs, ...}

    # ==> Note: the reference to Environment object as "state_value_coll.environment"
    Env = state_value_coll.environment

    alpha_obj = Alpha(alpha=alpha, const_alpha=const_alpha, half_life=alpha_half_life)

    if do_summ_print:
        print('=============== TD(0) EVALUATING THE FOLLOWING POLICY ====================')
        policy.summ_print(verbosity=0, environment=Env,
                          show_env_states=False, none_str='*')

    if all_start_states:
        s = 'Starting a Maximum of %i TD(0) All-Start-State Episodes\nGamma = %g'%\
            (max_num_episodes, gamma)
        start_stateL = [s_hash for s_hash in Env.iter_all_action_states()]
    else:
        s = 'Starting a Maximum of %i TD(0) Episodes from state "%s"\nGamma = %g'%\
            (max_num_episodes, str(Env.start_state_hash), gamma)
        start_stateL = [Env.start_state_hash]

    banner(s, banner_char='', leftMargin=0, just='center')

    num_episodes = 0
    keep_looping = True  # value-iteration stopping criteria
    progress_str = ''

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        # policy evaluation
        for start_hash in start_stateL:
            num_episodes += 1
            if num_episodes > max_num_episodes:
                break

            s_hash = start_hash
            a_desc = policy.get_single_action(s_hash)

            # cap episode length (uses max_num_episodes as the step limit)
            for _ in range(max_num_episodes):
                sn_hash, reward = Env.get_action_snext_reward(s_hash, a_desc)  # prob-weighted choice

                state_value_coll.td0_update(s_hash=s_hash, alpha=alpha_obj(),
                                            gamma=gamma, sn_hash=sn_hash,
                                            reward=reward)

                if (sn_hash in Env.terminal_set) or (sn_hash is None):
                    break

                # get ready for next step
                s_hash = sn_hash
                a_desc = policy.get_single_action(s_hash)
                if a_desc is None:
                    print('a_desc is None for policy.get_single_action( "%s" ) ='%\
                          str(s_hash), a_desc)

            abserr = state_value_coll.get_biggest_action_state_err()
            if abserr > max_abserr:
                keep_looping = True

            if num_episodes < min_num_episodes:
                keep_looping = True  # must loop for min_num_episodes at least

            pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
            out_str = '%i%%' % (5 * (int(pc_done / 5.0)))
            if out_str != progress_str:
                print(out_str, end=' ')
                progress_str = out_str

            if result_list == 'rms':
                resultL.append(state_value_coll.calc_rms_error(true_valueD))
            elif result_list == 'abserr':
                resultL.append(abserr)
            else:
                pass  # don't save anything to resultL

            if value_snapshot_loopL is not None and num_episodes in value_snapshot_loopL:
                value_snapD[num_episodes] = state_value_coll.get_snapshot()

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print('Exited TD(0) Policy Evaluation', s)
        print('   num_episodes  =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma         =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit   =', max_abserr)

        state_value_coll.summ_print(show_last_change=show_last_change, show_states=True)

    return resultL, value_snapD
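# --- TD(0) update sketch (assumption) ---------------------------------------
# The actual update above is delegated to state_value_coll.td0_update().  This
# is only a standalone sketch of the textbook tabular TD(0) rule it is presumed
# to apply: V(s) <- V(s) + alpha*(r + gamma*V(s') - V(s)).  Using a plain dict
# for V is an assumption made purely for illustration.
def td0_update_sketch(V, s_hash, sn_hash, reward, alpha=0.1, gamma=1.0):
    vs_next = V.get(sn_hash, 0.0)  # terminal / unseen next states treated as 0.0
    V[s_hash] += alpha * (reward + gamma * vs_next - V[s_hash])

V_demo = {'A': 0.0, 'B': 0.0}
td0_update_sketch(V_demo, 'A', 'B', reward=1.0)
print(V_demo)   # {'A': 0.1, 'B': 0.0}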
def sarsa_epsilon_greedy(environment, learn_tracker=None,        # track progress of learning
                         initial_Qsa=0.0,                        # init non-terminal_set of Q(s,a) (terminal_set=0.0)
                         initial_action_value_coll=None,         # if input, use it.
                         read_pickle_file='',
                         save_pickle_file='',
                         use_list_of_start_states=False,         # use list OR single start state of environment.
                         do_summ_print=True, show_last_change=True,
                         fmt_Q='%g', fmt_R='%g',
                         pcent_progress_print=10,
                         show_banner=True,
                         max_num_episodes=1000, min_num_episodes=10,
                         max_abserr=0.001, gamma=0.9,
                         iteration_prints=0,
                         max_episode_steps=sys.maxsize,
                         epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                         alpha=0.1, const_alpha=True, alpha_half_life=200,
                         N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply SARSA Temporal Difference to find the OPTIMAL POLICY and ACTION VALUES

    Returns: Policy and ActionValueColl objects

    Uses epsilon-greedy TD(0) updates to find Q(s,a), the Action-Value Function

    Terminates when abserr < max_abserr

    Assume that Q(s,a), action_value_coll, has been initialized prior to call.

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.

    CREATES BOTH policy AND action_value_coll OBJECTS.
    """
    # create EpsilonGreedy, Alpha and ActionValueColl objects
    eg = EpsilonGreedy(epsilon=epsilon, const_epsilon=const_epsilon,
                       half_life=epsilon_half_life,
                       N_episodes_wo_decay=N_episodes_wo_decay)

    alpha_obj = Alpha(alpha=alpha, const_alpha=const_alpha, half_life=alpha_half_life)

    if initial_action_value_coll is None:
        action_value_coll = ActionValueColl(environment, init_val=initial_Qsa)
    else:
        action_value_coll = initial_action_value_coll
    #action_value_coll.summ_print()
    num_s_hash = len(environment.get_all_action_state_hashes())

    if read_pickle_file:
        action_value_coll.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if show_banner:
        s = 'Starting a Maximum of %i SARSA Epsilon Greedy Episodes'%max_num_episodes +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%(environment.name, gamma, alpha_obj())
        banner(s, banner_char='', leftMargin=0, just='center')

    # Iterate over a list of known possible start states
    if use_list_of_start_states:
        loop_stateL = environment.limited_start_state_list()
    else:
        #loop_stateL = [ random.choice( environment.limited_start_state_list() ) ]
        loop_stateL = [environment.start_state_hash]

    if show_banner:
        print('======================= Iterating over Start States ==================================')
        print(loop_stateL)
        print('======================================================================================')

    # set counter and flag
    episode_loop_counter = 0
    keep_looping = True
    progress_str = ''

    while (episode_loop_counter <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria
        Nterminal_episodes = set()  # tracks if start_hash got to terminal_set or max_num_episodes

        for start_hash in loop_stateL:
            episode_loop_counter += 1
            if episode_loop_counter > max_num_episodes:
                break

            if learn_tracker is not None:
                learn_tracker.add_new_episode()

            s_hash = start_hash
            a_desc = action_value_coll.get_best_eps_greedy_action(s_hash, epsgreedy_obj=eg)

            for n_episode_steps in range(max_episode_steps):
                # Begin an episode
                if a_desc is None:
                    Nterminal_episodes.add(start_hash)
                    print('break for a_desc==None')
                    break
                else:
                    sn_hash, reward = environment.get_action_snext_reward(s_hash, a_desc)
                    if learn_tracker is not None:
                        learn_tracker.add_sarsn_to_current_episode(s_hash, a_desc,
                                                                   reward, sn_hash)

                    if sn_hash is None:
                        Nterminal_episodes.add(start_hash)
                        print('break for sn_hash==None')
                        break
                    else:
                        an_desc = action_value_coll.get_best_eps_greedy_action(sn_hash,
                                                                               epsgreedy_obj=eg)

                        action_value_coll.sarsa_update(s_hash=s_hash, a_desc=a_desc,
                                                       alpha=alpha_obj(), gamma=gamma,
                                                       sn_hash=sn_hash, an_desc=an_desc,
                                                       reward=reward)

                        if sn_hash in environment.terminal_set:
                            Nterminal_episodes.add(start_hash)
                            if (n_episode_steps == 0) and (num_s_hash > 2):
                                print('1st step break for sn_hash in terminal_set', sn_hash,
                                      ' s_hash=%s'%str(s_hash), ' a_desc=%s'%str(a_desc))
                            break

                        s_hash = sn_hash
                        a_desc = an_desc

        # increment episode counter on EpsilonGreedy and Alpha objects
        eg.inc_N_episodes()
        alpha_obj.inc_N_episodes()

        abserr = action_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if episode_loop_counter < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(episode_loop_counter) / float(max_num_episodes)
        if pcent_progress_print > 0:
            out_str = '%3i%%'%(pcent_progress_print * (int(pc_done / float(pcent_progress_print))))
        else:
            out_str = progress_str

        if out_str != progress_str:
            #score = environment.get_policy_score( policy=policy, start_state_hash=None, step_limit=1000)
            #print(out_str, ' score=%s'%str(score), ' = (r_sum, n_steps, msg)', end=' ')
            print(out_str, end=' ')
            print('Nterminal episodes =', len(Nterminal_episodes), ' of ', len(loop_stateL))
            progress_str = out_str

    #print()
    policy = action_value_coll.get_policy()

    if do_summ_print:
        s = ''
        if episode_loop_counter >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print('Exited Epsilon Greedy, TD(0) Value Iteration', s)
        print('   # episodes      =', episode_loop_counter,
              ' (min limit=%i)'%min_num_episodes,
              ' (max limit=%i)'%max_num_episodes)
        print('   gamma           =', gamma)
        print('   estimated err   =', abserr)
        print('   Error limit     =', max_abserr)
        print('Nterminal episodes =', len(Nterminal_episodes), ' of ', len(loop_stateL))

        action_value_coll.summ_print(show_last_change=show_last_change, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward', fmt=fmt_R,
                                     show_env_states=False, none_str='*')
        except:
            pass

        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_coll.save_to_pickle_file(save_pickle_file)

    return policy, action_value_coll  #, steps_per_episodeL, reward_sum_per_episodeL
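# --- SARSA update sketch (assumption) ---------------------------------------
# action_value_coll.sarsa_update() carries the actual update above.  For
# reference, this is a self-contained sketch of the on-policy SARSA rule:
# Q(s,a) <- Q(s,a) + alpha*(r + gamma*Q(s',a') - Q(s,a)).  The nested-dict Q
# table is an assumption made only for this illustration.
def sarsa_update_sketch(Q, s, a, reward, sn, an, alpha=0.1, gamma=0.9):
    q_next = Q[sn][an] if sn is not None else 0.0  # terminal transitions contribute 0.0
    Q[s][a] += alpha * (reward + gamma * q_next - Q[s][a])

Q_demo = {'s1': {'left': 0.0, 'right': 0.0},
          's2': {'left': 0.0, 'right': 0.0}}
sarsa_update_sketch(Q_demo, 's1', 'right', reward=1.0, sn='s2', an='left')
print(Q_demo['s1']['right'])   # 0.1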
def mc_every_visit_prediction(policy, state_value_coll, all_start_states=False,
                              do_summ_print=True, show_last_change=True, show_banner=True,
                              max_episode_steps=10000,
                              alpha=0.1, const_alpha=True, alpha_half_life=200,
                              max_num_episodes=1000, min_num_episodes=10,
                              max_abserr=0.001, gamma=0.9,
                              result_list='abserr', true_valueD=None,
                              value_snapshot_loopL=None):  # if input, save V(s) snapshot at iteration steps indicated
    """
    ... GIVEN A POLICY TO EVALUATE apply Monte Carlo Every Visit Prediction

    Use Episode Discounted Returns to find V(s), State-Value Function

    Terminates when abserr < max_abserr

    Assume that V(s), state_value_coll, has been initialized prior to call.
    (Note that the StateValues object has a reference to the Environment object)

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any start state.

    state_value_coll WILL BE CHANGED... policy WILL NOT.
    """
    resultL = []      # based on result_list, can be "rms" or "abserr"
    value_snapD = {}  # index=loop counter, value=dict of {s_hash:Vs, ...}

    # ==> Note: the reference to Environment object as "state_value_coll.environment"
    Env = state_value_coll.environment
    episode = Episode(Env.name + ' Episode')

    alpha_obj = Alpha(alpha=alpha, const_alpha=const_alpha, half_life=alpha_half_life)

    if do_summ_print:
        print('=============== EVALUATING THE FOLLOWING POLICY ====================')
        policy.summ_print(verbosity=0, environment=Env,
                          show_env_states=False, none_str='*')

    if all_start_states:
        s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g' % (
            max_num_episodes, gamma)
        start_stateL = [s_hash for s_hash in Env.iter_all_action_states()]
    else:
        s = 'Starting a Maximum of %i Monte Carlo Iterations from state "%s"\nGamma = %g' % (
            max_num_episodes, str(Env.start_state_hash), gamma)
        start_stateL = [Env.start_state_hash]

    if show_banner:
        banner(s, banner_char='', leftMargin=0, just='center')

    num_episodes = 0
    keep_looping = True  # value-iteration stopping criteria
    progress_str = ''

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        # policy evaluation
        random.shuffle(start_stateL)
        for start_hash in start_stateL:
            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break

            make_episode(start_hash, policy, Env, Env.terminal_set,
                         episode=episode, max_steps=max_episode_steps,
                         eps_greedy=None)
            num_episodes += 1

            for dr in episode.get_rev_discounted_returns(gamma=gamma):
                (s_hash, a_desc, reward, sn_hash, G) = dr
                state_value_coll.mc_update(s_hash, alpha_obj(), G)

            abserr = state_value_coll.get_biggest_action_state_err()
            if abserr > max_abserr:
                keep_looping = True

            if num_episodes < min_num_episodes:
                keep_looping = True  # must loop for min_num_episodes at least

            pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
            out_str = '%i%%' % (5 * (int(pc_done / 5.0)))
            if out_str != progress_str:
                print(out_str, end=' ')
                progress_str = out_str

            if result_list == 'rms':
                resultL.append(state_value_coll.calc_rms_error(true_valueD))
            elif result_list == 'abserr':
                resultL.append(abserr)
            else:
                pass  # don't save anything to resultL

            if value_snapshot_loopL is not None and num_episodes in value_snapshot_loopL:
                value_snapD[num_episodes] = state_value_coll.get_snapshot()

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print('Exited MC Every-Visit Policy Evaluation', s)
        print('   num episodes  =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma         =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit   =', max_abserr)

        state_value_coll.summ_print(show_last_change=show_last_change, show_states=True)

    return resultL, value_snapD
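# --- discounted returns sketch (assumption) ---------------------------------
# episode.get_rev_discounted_returns() supplies the returns G used in the
# mc_update() call above.  This is a minimal sketch of how discounted returns
# G_t = r_{t+1} + gamma*G_{t+1} are formed by sweeping a finished episode in
# reverse; the (s, a, r, sn) step layout is an assumption for illustration.
def rev_discounted_returns_sketch(stepsL, gamma=0.9):
    outL, G = [], 0.0
    for (s, a, r, sn) in reversed(stepsL):
        G = r + gamma * G
        outL.append((s, a, r, sn, G))
    return outL   # last step's return comes first

steps_demo = [('A', 'right', 0.0, 'B'), ('B', 'right', 1.0, 'T')]
for item in rev_discounted_returns_sketch(steps_demo):
    print(item)
# ('B', 'right', 1.0, 'T', 1.0) then ('A', 'right', 0.0, 'B', 0.9)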
def __init__(self, environment, learn_tracker=None,   # track progress of learning
             sa_linear_function=None,                  # if input, use it.
             update_type='sarsa',                      # can be 'sarsa', 'qlearn'
             read_pickle_file='',
             save_pickle_file='',
             do_summ_print=True, show_last_change=True,
             pcent_progress_print=10,
             show_banner=True,
             gamma=0.9,
             iteration_prints=0,
             max_episode_steps=sys.maxsize,
             epsilon=0.1,    # can be constant or EpsilonGreedy object
             alpha=0.1):     # can be constant or Alpha object
    """
    ... GIVEN AN ENVIRONMENT ...
    Use basic SARSA or Qlearning algorithm to solve for a linear approximation
    of STATE-ACTION VALUES, Q(s,a)

    Each action is forced to be a DETERMINISTIC action leading to one state and reward.
    (If the next state or reward changes, only the new values will be considered)

    attribute: self.action_value_linfunc is the linear approximation, Q(s,a) object

    A DETERMINISTIC policy can be created externally from the self.action_value_linfunc attribute.
    """
    self.environment = environment
    self.learn_tracker = learn_tracker
    self.save_pickle_file = save_pickle_file

    self.do_summ_print = do_summ_print
    self.show_last_change = show_last_change
    self.pcent_progress_print = pcent_progress_print

    self.gamma = gamma
    self.iteration_prints = iteration_prints
    self.max_episode_steps = max_episode_steps

    self.num_episodes = 0
    self.num_updates = 0

    # if input epsilon is a float, use it to create an EpsilonGreedy object
    if type(epsilon) == type(0.1):
        self.epsilon_obj = EpsilonGreedy(epsilon=epsilon, const_epsilon=True)
    else:
        self.epsilon_obj = epsilon

    # if input alpha is a float, use it to create an Alpha object
    if type(alpha) == type(0.1):
        self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
    else:
        self.alpha_obj = alpha

    # create the action_value_linfunc for the environment.
    self.action_value_linfunc = sa_linear_function
    self.update_type = update_type

    if read_pickle_file:
        self.action_value_linfunc.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        print('================== EPSILON GREEDY DEFINED AS ========================')
        self.epsilon_obj.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        self.alpha_obj.summ_print()

    if show_banner:
        s = 'Starting a Maximum of %i %s Semi-Gradient Epsilon Greedy Steps/Episode'%\
            (self.max_episode_steps, update_type.upper()) +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%(environment.name, self.gamma, self.alpha_obj())
        banner(s, banner_char='', leftMargin=0, just='center')
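# --- semi-gradient SARSA sketch (assumption) --------------------------------
# A self-contained sketch of the semi-gradient SARSA update this class relies
# on self.action_value_linfunc to provide, written out with explicit linear
# features:  w <- w + alpha*(r + gamma*qhat(s',a',w) - qhat(s,a,w)) * x(s,a).
# The feature vectors and helper names below are assumptions for illustration,
# not the library's actual encoding.
def qhat(w, x):
    return sum(wi * xi for wi, xi in zip(w, x))

def semi_gradient_sarsa_step(w, x_sa, reward, x_snan, alpha=0.1, gamma=0.9):
    target = reward + gamma * qhat(w, x_snan)
    delta = target - qhat(w, x_sa)
    return [wi + alpha * delta * xi for wi, xi in zip(w, x_sa)]

w_demo = [0.0, 0.0, 0.0]
x_sa_demo = [1.0, 0.0, 1.0]      # features of (s, a)      (made up)
x_snan_demo = [0.0, 1.0, 0.0]    # features of (s', a')    (made up)
w_demo = semi_gradient_sarsa_step(w_demo, x_sa_demo, reward=1.0, x_snan=x_snan_demo)
print(w_demo)   # [0.1, 0.0, 0.1]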
def mc_first_visit_prediction(policy, state_value_ave, first_visit=True,
                              do_summ_print=True, showRunningAve=False,
                              max_episode_steps=10000,
                              max_num_episodes=1000, min_num_episodes=10,
                              max_abserr=0.001, gamma=0.9):
    """
    ... GIVEN A POLICY TO EVALUATE apply Monte Carlo First Visit Prediction

    Use Episode Discounted Returns to find V(s), State-Value Function

    Terminates when abserr < max_abserr

    Assume that V(s), state_value_ave, has been initialized prior to call.
    (Note that the StateValues object has a reference to the Environment object)

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any start state.

    state_value_ave WILL BE CHANGED... policy WILL NOT.
    """
    # ==> Note: the reference to Environment object as "state_value_ave.environment"
    Env = state_value_ave.environment
    episode = Episode(Env.name + ' Episode')

    if do_summ_print:
        print('=============== EVALUATING THE FOLLOWING POLICY ====================')
        policy.summ_print(verbosity=0, environment=Env,
                          show_env_states=False, none_str='*')

        s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g'%\
            (max_num_episodes, gamma)
        banner(s, banner_char='', leftMargin=0, just='center')

    keep_looping = True  # value-iteration stopping criteria
    progress_str = ''
    num_episodes = 0

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        # policy evaluation
        for start_hash in Env.iter_all_action_states(randomize=True):
            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break

            make_episode(start_hash, policy, Env, Env.terminal_set,
                         episode=episode, max_steps=max_episode_steps,
                         eps_greedy=None)
            num_episodes += 1

            for dr in episode.get_rev_discounted_returns(gamma=gamma,
                                                         first_visit=first_visit,
                                                         visit_type='S'):
                (s_hash, a_desc, reward, sn_hash, G) = dr
                state_value_ave.add_val(s_hash, G)

        abserr = state_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            print(out_str, end=' ')
            progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print('Exited MC First-Visit Policy Evaluation', s)
        print('   num episodes  =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma         =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit   =', max_abserr)

        state_value_ave.summ_print(showRunningAve=showRunningAve, show_states=True)

    return abserr
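# --- first-visit filtering sketch (assumption) ------------------------------
# The first_visit=True path above keeps only the return from each state's first
# appearance in an episode.  This is a minimal standalone sketch of that
# filtering step; the (s, a, r, sn, G) tuple layout mirrors the tuples unpacked
# above and a forward-time ordering of the input is assumed for illustration.
def first_visit_returns_sketch(returnsL):
    seen = set()
    outL = []
    for (s, a, r, sn, G) in returnsL:    # returnsL assumed in forward time order
        if s not in seen:
            seen.add(s)
            outL.append((s, G))
    return outL

returns_demo = [('A', 'r', 0.0, 'B', 0.9),
                ('B', 'r', 1.0, 'T', 1.0),
                ('A', 'r', 0.0, 'B', 0.5)]   # second visit to 'A' is ignored
print(first_visit_returns_sketch(returns_demo))   # [('A', 0.9), ('B', 1.0)]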
def mc_exploring_starts(environment, initial_policy='default',
                        read_pickle_file='',
                        save_pickle_file='',
                        first_visit=True,
                        do_summ_print=True, showRunningAve=False,
                        fmt_Q='%g', fmt_R='%g',
                        show_initial_policy=True,
                        max_num_episodes=1000, min_num_episodes=10,
                        max_abserr=0.001, gamma=0.9,
                        max_episode_steps=10000,
                        iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Monte Carlo Exploring Starts to find the OPTIMAL POLICY

    initial_policy can be 'default', 'random', policy_dictionary, Policy object

    Returns: Policy and ActionValueRunAveColl objects

    Use Episode Discounted Returns to find Q(s,a), Action-Value Function

    Terminates when abserr < max_abserr

    Assume that Q(s,a), action_value_ave, has been initialized prior to call.

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.

    CREATES BOTH policy AND action_value OBJECTS.
    """
    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)

    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_exploring_starts')
        policy.intialize_policy_to_random(env=environment)
    elif isinstance(initial_policy, Policy):
        policy = initial_policy
    else:
        print('Initializing Policy to "custom policy" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    action_value_ave.init_Qsa_to_zero()  # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        if show_initial_policy:
            print('=============== STARTING WITH THE INITIAL POLICY ====================')
            policy.summ_print(verbosity=0, environment=environment,
                              show_env_states=False, none_str='*')

        s = 'Starting a Maximum of %i Monte Carlo Exploring Start Episodes\nfor "%s" with Gamma = %g'%\
            (max_num_episodes, environment.name, gamma)
        banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True
    progress_str = ''

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        for start_hash in environment.iter_all_action_states(randomize=True):
            a_descL = environment.get_state_legal_action_list(start_hash)
            # randomize action order
            random.shuffle(a_descL)

            # try every initial action for each start_hash
            for a_desc in a_descL:
                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash, policy, environment, environment.terminal_set,
                             episode=episode, first_a_desc=a_desc,
                             max_steps=max_episode_steps, eps_greedy=None)
                num_episodes += 1

                for dr in episode.get_rev_discounted_returns(gamma=gamma,
                                                             first_visit=first_visit,
                                                             visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    # greedy policy improvement with random tie-breaking
                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave(s, a)
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [a]
                            elif q == best_a_val:
                                bestL.append(a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy,
                                                 start_state_hash=None,
                                                 step_limit=1000)
            print(out_str, ' score=%s' % str(score), ' = (r_sum, n_steps, msg)',
                  ' estimated err =', abserr)
            progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print('Exited MC First-Visit Value Iteration', s)
        print('   num episodes  =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma         =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit   =', max_abserr)

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward', fmt=fmt_R,
                                     show_env_states=False, none_str='*')
        except:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
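# --- greedy improvement sketch (assumption) ---------------------------------
# The inner block above performs greedy policy improvement with random
# tie-breaking over the running Q(s,a) averages.  The same step is shown here
# in isolation; a plain dict of Q averages is an assumption for illustration.
import random

def greedy_action_sketch(qD):
    best_val = max(qD.values())
    bestL = [a for a, q in qD.items() if q == best_val]
    return random.choice(bestL)   # break ties at random

qD_demo = {'up': 0.5, 'down': 0.5, 'left': 0.1}
print(greedy_action_sketch(qD_demo))   # 'up' or 'down', chosen at random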
def __init__(self, environment, learn_tracker=None,   # track progress of learning
             initial_Qsa=0.0,                          # init non-terminal_set of Q(s,a) (terminal_set=0.0)
             initial_action_value_coll=None,           # if input, use it.
             read_pickle_file='',
             save_pickle_file='',
             do_summ_print=True, show_last_change=True,
             pcent_progress_print=10,
             show_banner=True,
             gamma=0.9,
             iteration_prints=0,
             max_episode_steps=sys.maxsize,
             epsilon=0.1,   # can be constant or EpsilonGreedy object
             alpha=0.1):    # can be constant or Alpha object
    """
    ... GIVEN AN ENVIRONMENT ...
    Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)

    Each action is forced to be a DETERMINISTIC action leading to one state and reward.
    (If the next state or reward changes, only the new values will be considered)

    attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object

    A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.
    """
    self.environment = environment
    self.learn_tracker = learn_tracker
    self.save_pickle_file = save_pickle_file

    self.do_summ_print = do_summ_print
    self.show_last_change = show_last_change
    self.pcent_progress_print = pcent_progress_print

    self.gamma = gamma
    self.iteration_prints = iteration_prints
    self.max_episode_steps = max_episode_steps

    self.num_episodes = 0
    self.num_updates = 0

    # if input epsilon is a float, use it to create an EpsilonGreedy object
    if type(epsilon) == type(0.1):
        self.epsilon_obj = EpsilonGreedy(epsilon=epsilon, const_epsilon=True)
    else:
        self.epsilon_obj = epsilon

    # if input alpha is a float, use it to create an Alpha object
    if type(alpha) == type(0.1):
        self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
    else:
        self.alpha_obj = alpha

    # create the action_value_coll for the environment.
    if initial_action_value_coll is None:
        self.action_value_coll = ActionValueColl(environment, init_val=initial_Qsa)
    else:
        self.action_value_coll = initial_action_value_coll

    if read_pickle_file:
        self.action_value_coll.init_from_pickle_file(read_pickle_file)

    # initialize the model that will build from experience
    # do not build the full model description on Model init; states not visited
    # by the RL portion will have no return values.
    self.model = Model(environment, build_initial_model=False)

    #for s_hash, aD in self.action_value_coll.QsaD.items():
    #    for a_desc, Q in aD.items():
    #        self.model.add_action( s_hash, a_desc )

    if do_summ_print:
        print('================== EPSILON GREEDY DEFINED AS ========================')
        self.epsilon_obj.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        self.alpha_obj.summ_print()

    if show_banner:
        s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%(environment.name, self.gamma, self.alpha_obj())
        banner(s, banner_char='', leftMargin=0, just='center')
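# --- Dyna-Q planning sketch (assumption) ------------------------------------
# For context, a self-contained sketch of the Dyna-Q idea this class is built
# around: one real Q-learning update per environment step, followed by n
# planning updates replayed from a learned deterministic model.  The plain-dict
# model {(s, a): (reward, s')} and Q table are assumptions for illustration,
# not the library's Model or ActionValueColl objects.
import random

def q_update(Q, s, a, r, sn, alpha=0.1, gamma=0.9):
    q_next = max(Q[sn].values()) if sn in Q else 0.0   # terminal / unknown next state -> 0.0
    Q[s][a] += alpha * (r + gamma * q_next - Q[s][a])

def dyna_q_planning(Q, model, n_planning=5, alpha=0.1, gamma=0.9):
    for _ in range(n_planning):
        (s, a) = random.choice(list(model.keys()))     # previously observed (s, a) pair
        r, sn = model[(s, a)]
        q_update(Q, s, a, r, sn, alpha=alpha, gamma=gamma)

Q_demo = {'s1': {'a': 0.0}, 's2': {'a': 0.0}}
model_demo = {('s1', 'a'): (1.0, 's2')}
dyna_q_planning(Q_demo, model_demo, n_planning=3)
print(Q_demo['s1']['a'])   # grows toward 1.0 with repeated planning sweeps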