示例#1
0
def plot_grid_numbers(rows_outL,
                      header='',
                      x_axis_label='',
                      do_show=True,
                      fmt='%g'):
    """Draw a grid of labelled rectangles, one per entry of rows_outL.

    Args:
        rows_outL: sequence of rows; each row is a sequence of cell labels.
            Rows may have different lengths; a cell missing from a short
            row is labelled '*'.
        header: if non-empty, used as the plot title.
        x_axis_label: if non-empty, used as the x-axis label.
        do_show: if True (default), call plt.show() once the grid is drawn.
        fmt: accepted for interface compatibility; not applied by this
            function.

    Returns:
        None.  When the module-level flag got_matplotlob is false, an error
        banner is printed and nothing is drawn.
    """
    if not got_matplotlob:
        banner(
            'ERROR: could not import matplotlib\n"plot_grid_numbers" FAILED.')
        return

    Nrows = len(rows_outL)
    # default=0 keeps an empty rows_outL from raising ValueError in max()
    Ncols = max((len(row) for row in rows_outL), default=0)

    plt.subplots()
    plt.axes()

    font = FontProperties()
    font.set_size('large')
    font.set_family('fantasy')
    font.set_style('normal')

    for i, rowL in enumerate(rows_outL):
        # row 0 of the input is drawn at the top of the plot
        y = Nrows - i - 1
        for j in range(Ncols):
            s = rowL[j] if j < len(rowL) else '*'

            #      Rectangle(  (x,y),    width,   height)
            rect = Rectangle((j, y),
                             0.9,
                             0.9,
                             fc='r',
                             alpha=0.5,
                             edgecolor='black')
            plt.gca().add_patch(rect)

            # label is centered in the 0.9 x 0.9 cell
            plt.text(j + .45, y + .45, s, fontproperties=font, **alignment)

    plt.xlim(0, Ncols)
    plt.ylim(0, Nrows)

    if header:
        plt.title(header)
    if x_axis_label:
        plt.xlabel(x_axis_label)

    # honor the caller's request to defer/skip the blocking show() call
    if do_show:
        plt.show()
示例#2
0
    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle"""

        #raise ValueError( 'read_pickle_file is BROKEN... DO NOT USE' )

        fname = self.make_pickle_filename(fname)
        if os.path.isfile(fname):
            pass  # all good
        elif os.path.isfile(os.path.join(mdp_path, fname)):
            fname = os.path.join(mdp_path, fname)
        else:
            print('Pickle File NOT found:', fname)
            print('mdp_path:', mdp_path)

            s = '''Try running: "introrl_build_mdp" to create MDP Pickle Files.
Type: introrl_build_mdp
at the command line.'''
            banner(s, banner_char='', leftMargin=0, just='center')

            return False

        fileObject = open(fname, 'rb')

        readD = pickle.load(fileObject)

        self.name = readD['name']
        self.define_statesD = readD['define_statesD']
        self.info = readD['info']
        self.layout = readD['layout']

        if 'start_state_hash' in readD:
            self.start_state_hash = readD['start_state_hash']
        if 'defined_limited_start_state_list' in readD:
            self.defined_limited_start_state_list = readD[
                'defined_limited_start_state_list']

        self.define_env_states_actions(
        )  # use define_statesD to initialize data structures
        # ----------------------

        fileObject.close()

        return True
示例#3
0
def td0_prediction(
    policy,
    state_value_coll,
    all_start_states=False,
    do_summ_print=True,
    show_last_change=True,
    alpha=0.1,
    const_alpha=True,
    alpha_half_life=200,
    max_num_episodes=1000,
    min_num_episodes=10,
    max_abserr=0.001,
    gamma=1.0,
    result_list='abserr',
    true_valueD=None,
    value_snapshot_loopL=None
):  # if input, save V(s) snapshot at iteration steps indicated
    """
    ... GIVEN A POLICY TO EVALUATE  apply TD(0), Temporal Difference(0) Prediction

    Each episode starts at a start state, follows policy.get_single_action,
    and calls state_value_coll.td0_update on every step.  The environment is
    taken from state_value_coll.environment.

    Looping stops once state_value_coll.get_biggest_action_state_err() is no
    larger than max_abserr (and at least min_num_episodes have been run), or
    when max_num_episodes episodes have been run.

    Args:
        policy: object providing get_single_action(s_hash) and summ_print.
        state_value_coll: the V(s) collection that is updated in place.
        all_start_states: if True, every state from
            Env.iter_all_action_states() starts an episode on each pass;
            otherwise only Env.start_state_hash is used.
        do_summ_print: print the policy before, and a summary after, the run.
        show_last_change: forwarded to state_value_coll.summ_print.
        alpha, const_alpha, alpha_half_life: forwarded to Alpha(...).
        max_num_episodes: episode limit; also used as the per-episode step cap.
        min_num_episodes: minimum number of episodes before stopping on error.
        max_abserr: error threshold for stopping.
        gamma: forwarded to state_value_coll.td0_update.
        result_list: 'rms' saves calc_rms_error(true_valueD) per pass,
            'abserr' saves abserr per pass, anything else saves nothing.
        true_valueD: forwarded to calc_rms_error when result_list == 'rms'.
        value_snapshot_loopL: if given, a snapshot of V(s) is saved after any
            pass whose episode count is in this collection.

    Returns:
        (resultL, value_snapD): the per-pass results and a dict of
        {num_episodes: state_value_coll.get_snapshot()}.

    state_value_coll WILL BE CHANGED.
    """

    resultL = []  # based on result_list, can be "rms" or "abserr"
    value_snapD = {}  # index=loop counter, value=dict of {s_hash:Vs, ...}

    # ==> Note: the reference to Environment object as "state_value_coll.environment"
    Env = state_value_coll.environment

    alpha_obj = Alpha(alpha=alpha,
                      const_alpha=const_alpha,
                      half_life=alpha_half_life)

    if do_summ_print:
        print(
            '=============== TD(0) EVALUATING THE FOLLOWING POLICY ===================='
        )
        policy.summ_print(verbosity=0,
                          environment=Env,
                          show_env_states=False,
                          none_str='*')

    if all_start_states:
        s = 'Starting a Maximum of %i TD(0) All-Start-State Episodes\nGamma = %g'%\
            (max_num_episodes, gamma)
        start_stateL = [s_hash for s_hash in Env.iter_all_action_states()]
    else:
        s = 'Starting a Maximum of %i TD(0) Episodes from state "%s"\nGamma = %g'%\
            (max_num_episodes, str(Env.start_state_hash), gamma)
        start_stateL = [Env.start_state_hash]

    banner(s, banner_char='', leftMargin=0, just='center')

    num_episodes = 0
    keep_looping = True
    # bound up front so the summary below works even if the loop never runs
    abserr = 0.0

    progress_str = ''
    while (num_episodes <= max_num_episodes - 1) and keep_looping:

        keep_looping = False

        # policy evaluation
        for start_hash in start_stateL:
            # check the limit BEFORE counting, so num_episodes only counts
            # episodes that were actually run
            if num_episodes >= max_num_episodes:
                break
            num_episodes += 1

            s_hash = start_hash
            a_desc = policy.get_single_action(s_hash)
            # steps per episode are capped at max_num_episodes
            for _ in range(max_num_episodes):

                sn_hash, reward = Env.get_action_snext_reward(
                    s_hash, a_desc)  # prob-weighted choice

                state_value_coll.td0_update(s_hash=s_hash,
                                            alpha=alpha_obj(),
                                            gamma=gamma,
                                            sn_hash=sn_hash,
                                            reward=reward)

                if (sn_hash in Env.terminal_set) or (sn_hash is None):
                    break

                # get ready for next step
                s_hash = sn_hash

                a_desc = policy.get_single_action(s_hash)
                if a_desc is None:
                    print('a_desc is None for policy.get_single_action( "%s" ) ='%\
                          str(s_hash), a_desc)

        # termination criteria
        abserr = state_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        # progress is reported in 5% steps
        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            print(out_str, end=' ')
            progress_str = out_str

        if result_list == 'rms':
            resultL.append(state_value_coll.calc_rms_error(true_valueD))
        elif result_list == 'abserr':
            resultL.append(abserr)
        # any other value: don't save anything to resultL

        # snapshot is taken inside the loop so every requested episode count
        # is checked, not just the final one
        if value_snapshot_loopL is not None and num_episodes in value_snapshot_loopL:
            value_snapD[num_episodes] = state_value_coll.get_snapshot()

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited TD(0) Policy Evaluation', s)
        print('   num_episodes   =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma          =', gamma)
        print('   estimated err  =', abserr)
        print('   Error limit    =', max_abserr)

        state_value_coll.summ_print(show_last_change=show_last_change,
                                    show_states=True)

    return resultL, value_snapD
示例#4
0
def sarsa_epsilon_greedy( environment,  learn_tracker=None, # track progress of learning
                          initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                          initial_action_value_coll=None, # if input, use it.
                          read_pickle_file='', 
                          save_pickle_file='',
                          use_list_of_start_states=False, # use list OR single start state of environment.
                          do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                          pcent_progress_print=10,
                          show_banner = True,
                          max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001, 
                          gamma=0.9,
                          iteration_prints=0,
                          max_episode_steps=sys.maxsize,
                          epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                          alpha=0.1, const_alpha=True, alpha_half_life=200,
                          N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ... 
    apply SARSA Temporal Difference with epsilon-greedy action selection.
    
    Returns: (policy, action_value_coll) where policy comes from
    action_value_coll.get_policy() after the last episode.
    
    Q(s,a) is held in action_value_coll: initial_action_value_coll if given,
    otherwise a new ActionValueColl( environment, init_val=initial_Qsa ).
    If read_pickle_file is non-empty, action_value_coll is then initialized
    from that file.
    
    Episodes start from environment.limited_start_state_list() when
    use_list_of_start_states is True, otherwise from environment.start_state_hash.
    Each episode runs at most max_episode_steps steps and ends early when the
    chosen action is None, the next state is None, or the next state is in
    environment.terminal_set.
    
    Looping stops once action_value_coll.get_biggest_action_state_err() is no
    larger than max_abserr (and at least min_num_episodes have been run), or
    when max_num_episodes episodes have been run.
    
    If learn_tracker is given, add_new_episode() and
    add_sarsn_to_current_episode(...) are called on it as episodes progress.
    
    If save_pickle_file is non-empty, both policy and action_value_coll are
    saved with save_to_pickle_file( save_pickle_file ).
    
    iteration_prints is accepted but not used by this function.
    """
    
    # create EpsilonGreedy, Alpha and ActionValueColl objects
    eg = EpsilonGreedy(epsilon=epsilon, const_epsilon=const_epsilon, half_life=epsilon_half_life,
                       N_episodes_wo_decay=N_episodes_wo_decay)

    alpha_obj = Alpha( alpha=alpha, const_alpha=const_alpha, half_life=alpha_half_life )

    if initial_action_value_coll is None:
        action_value_coll = ActionValueColl( environment, init_val=initial_Qsa )
    else:
        action_value_coll = initial_action_value_coll
    #action_value_coll.summ_print()
    num_s_hash = len( environment.get_all_action_state_hashes() )

    if read_pickle_file:
        action_value_coll.init_from_pickle_file( read_pickle_file )
    
    if do_summ_print:
        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()
        
        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()
    
    if show_banner:
        s = 'Starting a Maximum of %i SARSA Epsilon Greedy Episodes'%max_num_episodes +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, gamma, alpha_obj() )
        banner(s, banner_char='', leftMargin=0, just='center')
        
    # Iterate over a list of known possible start states
    if use_list_of_start_states:
        loop_stateL = environment.limited_start_state_list()
    else:
        #loop_stateL = [ random.choice( environment.limited_start_state_list() ) ]
        loop_stateL = [ environment.start_state_hash ]
        
    if show_banner:
        print('======================= Iterating over Start States ==================================')
        print( loop_stateL )
        print('======================================================================================')

    # set counter and flag
    episode_loop_counter = 0
    keep_looping = True
    
    # bound up front so the summary below works even if the loop never runs
    abserr = 0.0
    Nterminal_episodes = set()
    
    progress_str = ''
    while (episode_loop_counter<=max_num_episodes-1) and keep_looping :
            
        keep_looping = False
        Nterminal_episodes = set() # tracks if start_hash got to terminal_set or max_num_episodes
        
        for start_hash in loop_stateL:
            # check the limit BEFORE counting, so the counter only counts
            # episodes that were actually run
            if episode_loop_counter >= max_num_episodes:
                break
            episode_loop_counter += 1
            
            if learn_tracker is not None:
                learn_tracker.add_new_episode()
            
            s_hash = start_hash
            a_desc = action_value_coll.get_best_eps_greedy_action( s_hash, epsgreedy_obj=eg )
            
            # Run one episode
            for n_episode_steps in range( max_episode_steps ):
                
                if a_desc is None:
                    Nterminal_episodes.add( start_hash )
                    print('break for a_desc==None')
                    break
                
                sn_hash, reward = environment.get_action_snext_reward( s_hash, a_desc )
                if learn_tracker is not None:
                    learn_tracker.add_sarsn_to_current_episode( s_hash, a_desc, 
                                                                reward, sn_hash)
                
                if sn_hash is None:
                    Nterminal_episodes.add( start_hash )
                    print('break for sn_hash==None')
                    break
                
                # SARSA: pick the next action first, then update with it
                an_desc = action_value_coll.get_best_eps_greedy_action( sn_hash, 
                                                                        epsgreedy_obj=eg )
    
                action_value_coll.sarsa_update( s_hash=s_hash, a_desc=a_desc, 
                                                alpha=alpha_obj(), gamma=gamma, 
                                                sn_hash=sn_hash, an_desc=an_desc, 
                                                reward=reward)
                
                if sn_hash in environment.terminal_set:
                    Nterminal_episodes.add( start_hash )
                    if (n_episode_steps==0) and (num_s_hash>2):
                        print('1st step break for sn_hash in terminal_set', sn_hash, 
                              ' s_hash=%s'%str(s_hash), ' a_desc=%s'%str(a_desc))
                    break
                
                s_hash = sn_hash
                a_desc = an_desc
        
        # increment episode counter on EpsilonGreedy and Alpha objects
        eg.inc_N_episodes()
        alpha_obj.inc_N_episodes()
                
        # termination criteria
        abserr = action_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True
            
        if episode_loop_counter < min_num_episodes:
            keep_looping = True # must loop for min_num_episodes at least
            
        pc_done = 100.0 * float(episode_loop_counter) / float(max_num_episodes)
        
        # pcent_progress_print <= 0 disables progress output
        if pcent_progress_print > 0:
            out_str = '%3i%%'%( pcent_progress_print*(int(pc_done/float(pcent_progress_print)) ) )
        else:
            out_str = progress_str
        
        if out_str != progress_str:
            #score = environment.get_policy_score( policy=policy, start_state_hash=None, step_limit=1000)
            #print(out_str, ' score=%s'%str(score), ' = (r_sum, n_steps, msg)', end=' ')
            
            print(out_str, end=' ')
            print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))
            progress_str = out_str
    #print()
    
    policy = action_value_coll.get_policy()
    
    if do_summ_print:
        s = ''
        if episode_loop_counter >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print( 'Exited Epsilon Greedy, TD(0) Value Iteration', s )
        print( '   # episodes      =', episode_loop_counter, ' (min limit=%i)'%min_num_episodes, ' (max limit=%i)'%max_num_episodes )
        print( '   gamma           =', gamma )
        print( '   estimated err   =', abserr )
        print( '   Error limit     =', max_abserr )
        print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))
    
        action_value_coll.summ_print(show_last_change=show_last_change, fmt_Q=fmt_Q )
        policy.summ_print(  environment=environment, verbosity=0, show_env_states=False  )
        
        # Best-effort: sims may not have a layout_print.  "except Exception"
        # keeps that behavior without swallowing KeyboardInterrupt/SystemExit.
        try:
            environment.layout_print( vname='reward', fmt=fmt_R, show_env_states=False, none_str='*')
        except Exception:
            pass

        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()
        
        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if save_pickle_file:
        policy.save_to_pickle_file( save_pickle_file )
        action_value_coll.save_to_pickle_file( save_pickle_file )
        
    return policy, action_value_coll #, steps_per_episodeL, reward_sum_per_episodeL
示例#5
0
def mc_every_visit_prediction(
    policy,
    state_value_coll,
    all_start_states=False,
    do_summ_print=True,
    show_last_change=True,
    show_banner=True,
    max_episode_steps=10000,
    alpha=0.1,
    const_alpha=True,
    alpha_half_life=200,
    max_num_episodes=1000,
    min_num_episodes=10,
    max_abserr=0.001,
    gamma=0.9,
    result_list='abserr',
    true_valueD=None,
    value_snapshot_loopL=None
):  # if input, save V(s) snapshot at iteration steps indicated
    """
    ... GIVEN A POLICY TO EVALUATE  apply Monte Carlo Every Visit Prediction

    Each episode is generated with make_episode(...) into a shared Episode
    object; then, for every entry of
    episode.get_rev_discounted_returns(gamma=gamma),
    state_value_coll.mc_update(s_hash, alpha_obj(), G) is called.
    The environment is taken from state_value_coll.environment.

    Looping stops once state_value_coll.get_biggest_action_state_err() is no
    larger than max_abserr (and at least min_num_episodes have been run), or
    when max_num_episodes episodes have been run.

    Args:
        policy: the policy passed to make_episode and printed by summ_print.
        state_value_coll: the V(s) collection that is updated in place.
        all_start_states: if True, every state from
            Env.iter_all_action_states() starts an episode on each pass (in
            shuffled order); otherwise only Env.start_state_hash is used.
        do_summ_print: print the policy before, and a summary after, the run.
        show_last_change: forwarded to state_value_coll.summ_print.
        show_banner: print the start-up banner.
        max_episode_steps: forwarded to make_episode as max_steps.
        alpha, const_alpha, alpha_half_life: forwarded to Alpha(...).
        max_num_episodes, min_num_episodes, max_abserr: stopping limits.
        gamma: forwarded to episode.get_rev_discounted_returns.
        result_list: 'rms' saves calc_rms_error(true_valueD) per pass,
            'abserr' saves abserr per pass, anything else saves nothing.
        true_valueD: forwarded to calc_rms_error when result_list == 'rms'.
        value_snapshot_loopL: if given, a snapshot of V(s) is saved after any
            pass whose episode count is in this collection.

    Returns:
        (resultL, value_snapD): the per-pass results and a dict of
        {num_episodes: state_value_coll.get_snapshot()}.

    state_value_coll WILL BE CHANGED.
    """

    resultL = []  # based on result_list, can be "rms" or "abserr"
    value_snapD = {}  # index=loop counter, value=dict of {s_hash:Vs, ...}

    # ==> Note: the reference to Environment object as "state_value_coll.environment"
    Env = state_value_coll.environment
    episode = Episode(Env.name + ' Episode')

    alpha_obj = Alpha(alpha=alpha,
                      const_alpha=const_alpha,
                      half_life=alpha_half_life)

    if do_summ_print:
        print(
            '=============== EVALUATING THE FOLLOWING POLICY ===================='
        )
        policy.summ_print(verbosity=0,
                          environment=Env,
                          show_env_states=False,
                          none_str='*')

    if all_start_states:
        s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g' % (
            max_num_episodes, gamma)
        start_stateL = [s_hash for s_hash in Env.iter_all_action_states()]
    else:
        s = 'Starting a Maximum of %i Monte Carlo Iterations from state "%s"\nGamma = %g' % (
            max_num_episodes, str(Env.start_state_hash), gamma)
        start_stateL = [Env.start_state_hash]

    if show_banner:
        banner(s, banner_char='', leftMargin=0, just='center')

    num_episodes = 0
    keep_looping = True
    # bound up front so the summary below works even if the loop never runs
    abserr = 0.0

    progress_str = ''
    while (num_episodes <= max_num_episodes - 1) and keep_looping:

        keep_looping = False

        # policy evaluation
        random.shuffle(start_stateL)
        for start_hash in start_stateL:

            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break

            make_episode(start_hash,
                         policy,
                         Env,
                         Env.terminal_set,
                         episode=episode,
                         max_steps=max_episode_steps,
                         eps_greedy=None)

            num_episodes += 1

            # only the state and its discounted return G are needed here
            for (s_hash, _a_desc, _reward, _sn_hash,
                 G) in episode.get_rev_discounted_returns(gamma=gamma):
                state_value_coll.mc_update(s_hash, alpha_obj(), G)

        # termination criteria
        abserr = state_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        # progress is reported in 5% steps
        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            print(out_str, end=' ')
            progress_str = out_str

        if result_list == 'rms':
            resultL.append(state_value_coll.calc_rms_error(true_valueD))
        elif result_list == 'abserr':
            resultL.append(abserr)
        # any other value: don't save anything to resultL

        # snapshot is taken inside the loop so every requested episode count
        # is checked, not just the final one
        if value_snapshot_loopL is not None and num_episodes in value_snapshot_loopL:
            value_snapD[num_episodes] = state_value_coll.get_snapshot()

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited MC Every-Visit Policy Evaluation', s)
        print('   num episodes   =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma          =', gamma)
        print('   estimated err  =', abserr)
        print('   Error limit    =', max_abserr)

        state_value_coll.summ_print(show_last_change=show_last_change,
                                    show_states=True)

    return resultL, value_snapD
示例#6
0
    def __init__(
        self,
        environment,
        learn_tracker=None,  # track progress of learning
        sa_linear_function=None,  # if input, use it.
        update_type='sarsa',  # can be 'sarsa', 'qlearn'
        read_pickle_file='',
        save_pickle_file='',
        do_summ_print=True,
        show_last_change=True,
        pcent_progress_print=10,
        show_banner=True,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,  # can be constant or EpsilonGreedy object
        alpha=0.1):  # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ... 
        Use basic SARSA or Qlearning algorithm to solve for linear approximation of
        STATE-ACTION VALUES, Q(s,a)
        
        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)
            
        attribute: self.action_value_linfunc is the linear approximation, Q(s,a) object
        
        A DETERMINISTIC policy can be created externally from the self.action_value_linfunc attribute.

        This constructor only stores its settings, wraps numeric epsilon/alpha
        values, optionally loads the linear function from a pickle file, and
        prints the start-up summary/banner.

        Args:
            environment: stored as self.environment; its name is shown in the banner.
            learn_tracker: stored as self.learn_tracker.
            sa_linear_function: stored as self.action_value_linfunc.  It must
                not be None when read_pickle_file is given, since
                init_from_pickle_file is called on it.
            update_type: stored as self.update_type ('sarsa' or 'qlearn').
            read_pickle_file: if non-empty, passed to
                self.action_value_linfunc.init_from_pickle_file.
            save_pickle_file: stored as self.save_pickle_file.
            do_summ_print: if True, print the epsilon and alpha summaries.
            show_last_change, pcent_progress_print, gamma, iteration_prints,
            max_episode_steps: stored as attributes of the same name.
            show_banner: if True, print the start-up banner.
            epsilon: a number (wrapped in a constant EpsilonGreedy) or an
                already-built object, stored as self.epsilon_obj.
            alpha: a number (wrapped in a constant Alpha) or an already-built
                object, stored as self.alpha_obj.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # A plain number becomes a constant EpsilonGreedy object.  isinstance
        # also accepts ints and float subclasses, which an exact type
        # comparison would wrongly treat as a ready-made object.
        if isinstance(epsilon, (int, float)):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon,
                                             const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # Same rule for alpha: a plain number becomes a constant Alpha object.
        if isinstance(alpha, (int, float)):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # the action_value_linfunc for the environment is supplied by the caller.
        self.action_value_linfunc = sa_linear_function
        self.update_type = update_type

        if read_pickle_file:
            self.action_value_linfunc.init_from_pickle_file(read_pickle_file)

        if do_summ_print:
            print(
                '================== EPSILON GREEDY DEFINED AS ========================'
            )
            self.epsilon_obj.summ_print()

            print(
                '================== LEARNING RATE DEFINED AS ========================'
            )
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i %s Semi-Gradient Epsilon Greedy Steps/Episode'%(self.max_episode_steps, update_type.upper()) +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')
示例#7
0
def mc_first_visit_prediction( policy, state_value_ave, first_visit=True, 
                               do_summ_print=True, showRunningAve=False,
                               max_episode_steps=10000,
                               max_num_episodes=1000, min_num_episodes=10, 
                               max_abserr=0.001, gamma=0.9):
    """
    ... GIVEN A POLICY TO EVALUATE  apply Monte Carlo First Visit Prediction
    
    Each pass starts one episode from every state yielded by
    Env.iter_all_action_states( randomize=True ).  An episode is generated
    with make_episode(...) into a shared Episode object; then, for every entry
    of episode.get_rev_discounted_returns( gamma, first_visit, visit_type='S' ),
    state_value_ave.add_val( s_hash, G ) is called.
    The environment is taken from state_value_ave.environment.
    
    Looping stops once state_value_ave.get_biggest_action_state_err() is no
    larger than max_abserr (and at least min_num_episodes have been run), or
    when max_num_episodes episodes have been run.
    
    Args:
        policy: the policy passed to make_episode and printed by summ_print.
        state_value_ave: the V(s) collection that is updated in place.
        first_visit: forwarded to episode.get_rev_discounted_returns.
        do_summ_print: print the policy before, and a summary after, the run.
        showRunningAve: forwarded to state_value_ave.summ_print.
        max_episode_steps: forwarded to make_episode as max_steps.
        max_num_episodes, min_num_episodes, max_abserr: stopping limits.
        gamma: forwarded to episode.get_rev_discounted_returns.
    
    Returns:
        abserr from the last pass; 0.0 if no pass was run
        (e.g. max_num_episodes <= 0).
    
    state_value_ave WILL BE CHANGED.
    """
    
    # ==> Note: the reference to Environment object as "state_value_ave.environment"
    Env = state_value_ave.environment
    episode = Episode( Env.name + ' Episode' )
    
    if do_summ_print:
        print('=============== EVALUATING THE FOLLOWING POLICY ====================')
        policy.summ_print( verbosity=0, environment=Env, 
                   show_env_states=False, none_str='*')
                   
    s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g'%(max_num_episodes, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')
    
    keep_looping = True
    progress_str = ''
    num_episodes = 0
    
    # bound up front so the summary and the return below work even if the
    # loop never runs
    abserr = 0.0
    
    while (num_episodes<=max_num_episodes-1) and keep_looping:
        
        keep_looping = False
        
        # policy evaluation 
        for start_hash in Env.iter_all_action_states( randomize=True ):
            
            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break
        
            make_episode(start_hash, policy, Env, Env.terminal_set, episode=episode,
                         max_steps=max_episode_steps, eps_greedy=None)
            
            num_episodes += 1
            
            # only the state and its discounted return G are needed here
            for (s_hash, _a_desc, _reward, _sn_hash, G) in \
                    episode.get_rev_discounted_returns( gamma=gamma, 
                                                        first_visit=first_visit, 
                                                        visit_type='S'):
                state_value_ave.add_val( s_hash, G)
        
        # termination criteria
        abserr = state_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True
            
        if num_episodes < min_num_episodes:
            keep_looping = True # must loop for min_num_episodes at least
                    
        # progress is reported in 5% steps
        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%i%%'%( 5*(int(pc_done/5.0) ) )
        if out_str != progress_str:
            print(out_str, end=' ')
            progress_str = out_str
            
    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print( 'Exited MC First-Visit Policy Evaluation', s )
        print( '   num episodes   =', num_episodes, ' (min limit=%i)'%min_num_episodes, ' (max limit=%i)'%max_num_episodes )
        print( '   gamma          =', gamma )
        print( '   estimated err  =', abserr )
        print( '   Error limit    =', max_abserr )
    
        state_value_ave.summ_print( showRunningAve=showRunningAve, show_states=True)

    return abserr
示例#8
0
def mc_exploring_starts(environment,
                        initial_policy='default',
                        read_pickle_file='',
                        save_pickle_file='',
                        first_visit=True,
                        do_summ_print=True,
                        showRunningAve=False,
                        fmt_Q='%g',
                        fmt_R='%g',
                        show_initial_policy=True,
                        max_num_episodes=1000,
                        min_num_episodes=10,
                        max_abserr=0.001,
                        gamma=0.9,
                        max_episode_steps=10000,
                        iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Monte Carlo Exploring Starts to find the OPTIMAL POLICY

    Both the Policy and the ActionValueRunAveColl, Q(s,a), objects are
    created here (Q(s,a) is initialized to zero) and optionally overwritten
    from "read_pickle_file".

    Each sweep starts one episode from every (action state, legal action)
    pair, in random order.  The discounted returns of each episode update the
    running average Q(s,a), and the policy at every visited state is reset to
    the action with the highest average (ties are broken at random).

    Sweeps repeat while the biggest action-state error reported by the
    Q(s,a) collection exceeds max_abserr, or while fewer than
    min_num_episodes have been run.  The loop always stops once
    max_num_episodes episodes have been run.

    Parameters:
        environment        : environment to solve.
        initial_policy     : 'default', 'random', a Policy object (used as-is),
                             or a policy dictionary given to set_policy_from_piD.
        read_pickle_file   : if non-empty, initialize policy and Q(s,a) from it.
        save_pickle_file   : if non-empty, save policy and Q(s,a) to it on exit.
        first_visit        : passed to Episode.get_rev_discounted_returns.
        do_summ_print      : print the initial policy and the final summary.
        showRunningAve     : passed to the Q(s,a) summary print.
        fmt_Q, fmt_R       : formats for the Q(s,a) summary and reward layout.
        show_initial_policy: if False, skip printing the initial policy.
        max_num_episodes   : hard limit on the number of episodes.
        min_num_episodes   : minimum number of episodes before terminating.
        max_abserr         : error limit used as the convergence criterion.
        gamma              : discount factor.
        max_episode_steps  : step limit handed to make_episode.
        iteration_prints   : not used by this function.

    Returns: Policy and ActionValueRunAveColl objects
    """

    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)
    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_exploring_starts')
        policy.intialize_policy_to_random(env=environment)
    elif isinstance(initial_policy, Policy):
        policy = initial_policy
    else:
        print('Initializing Policy to "custom policy" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    # Terminal states w/o an action are NOT included
    action_value_ave.init_Qsa_to_zero()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print and show_initial_policy:
        print(
            '=============== STARTING WITH THE INITIAL POLICY ===================='
        )
        policy.summ_print(verbosity=0,
                          environment=environment,
                          show_env_states=False,
                          none_str='*')

    s = 'Starting a Maximum of %i Monte Carlo Exploring Start Episodes\nfor "%s" with Gamma = %g'%\
        (max_num_episodes, environment.name, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True

    # defined up front so the summary print works even if no sweep is run
    abserr = 0.0

    progress_str = ''
    while (num_episodes <= max_num_episodes - 1) and keep_looping:

        keep_looping = False

        for start_hash in environment.iter_all_action_states(randomize=True):

            # episode budget is spent; no point visiting the remaining states
            if num_episodes >= max_num_episodes:
                break

            # copy before shuffling so the environment's own list is untouched
            a_descL = list(environment.get_state_legal_action_list(start_hash))
            # randomize action order
            random.shuffle(a_descL)

            # try every initial action for each start_hash
            for first_a_desc in a_descL:

                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash,
                             policy,
                             environment,
                             environment.terminal_set,
                             episode=episode,
                             first_a_desc=first_a_desc,
                             max_steps=max_episode_steps,
                             eps_greedy=None)

                num_episodes += 1

                for dr in episode.get_rev_discounted_returns(
                        gamma=gamma, first_visit=first_visit, visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    # make the policy greedy w.r.t. the updated Q(s,a)
                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_val = float('-inf')
                        bestL = [aL[0]]
                        for cand_a in aL:
                            q = action_value_ave.get_ave(s, cand_a)
                            if q > best_a_val:
                                best_a_val = q
                                bestL = [cand_a]
                            elif q == best_a_val:
                                bestL.append(cand_a)
                        # break ties among equally good actions at random
                        policy.set_sole_action(s, random.choice(bestL))

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        # report progress whenever a new 5% step is reached
        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy,
                                                 start_state_hash=None,
                                                 step_limit=1000)
            print(out_str, ' score=%s' % str(score),
                  ' = (r_sum, n_steps, msg)', '   estimated err =', abserr)
            progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited MC First-Visit Value Iteration', s)
        print('   num episodes   =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma          =', gamma)
        print('   estimated err  =', abserr)
        print('   Error limit    =', max_abserr)

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment,
                          verbosity=0,
                          show_env_states=False)

        # sims may not have a layout_print; this printout is best-effort only.
        # "except Exception" keeps KeyboardInterrupt/SystemExit propagating.
        try:
            environment.layout_print(vname='reward',
                                     fmt=fmt_R,
                                     show_env_states=False,
                                     none_str='*')
        except Exception:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
示例#9
0
    def __init__(
        self,
        environment,
        learn_tracker=None,  # track progress of learning
        initial_Qsa=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
        initial_action_value_coll=None,  # if input, use it.
        read_pickle_file='',
        save_pickle_file='',
        do_summ_print=True,
        show_last_change=True,
        pcent_progress_print=10,
        show_banner=True,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,  # can be constant or EpsilonGreedy object
        alpha=0.1):  # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ... 
        Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)
        
        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)
            
        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object
        
        A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.

        Parameters:
            environment              : environment to learn; stored on self.
            learn_tracker            : optional object for tracking learning progress.
            initial_Qsa              : initial value given to a newly created ActionValueColl.
            initial_action_value_coll: if not None, used as Q(s,a) instead of creating one.
            read_pickle_file         : if non-empty, Q(s,a) is initialized from this file.
            save_pickle_file         : stored on self (not used inside __init__).
            do_summ_print            : if True, print the epsilon and alpha summaries.
            show_last_change         : stored on self.
            pcent_progress_print     : stored on self.
            show_banner              : if True, print the starting banner.
            gamma                    : discount factor.
            iteration_prints         : stored on self.
            max_episode_steps        : step limit per episode.
            epsilon                  : a number (wrapped in a constant EpsilonGreedy)
                                       or an EpsilonGreedy-like object.
            alpha                    : a number (wrapped in a constant Alpha)
                                       or an Alpha-like object.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a plain number, use it to create an EpsilonGreedy object
        # (isinstance also accepts ints and float subclasses, which the exact
        #  type comparison used to mistake for a ready-made object)
        if isinstance(epsilon, (int, float)):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon,
                                             const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a plain number, use it to create an Alpha object
        if isinstance(alpha, (int, float)):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment,
                                                     init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will build from experience
        # do not build full model description on Model init, states not visited
        #  by the RL portion will have no returns values.
        self.model = Model(environment, build_initial_model=False)

        if do_summ_print:
            print(
                '================== EPSILON GREEDY DEFINED AS ========================'
            )
            self.epsilon_obj.summ_print()

            print(
                '================== LEARNING RATE DEFINED AS ========================'
            )
            self.alpha_obj.summ_print()

        if show_banner:
            # the alpha object is called to get the current learning rate for display
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')