Example #1
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.agent_supt.nstep_td_eval_walker import NStepTDWalker
from introrl.mdp_data.random_walk_generic_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode
import matplotlib.pyplot as plt  # needed for plt.subplots() below

GAMMA=1.0

AVE_OVER = 100

rw_mrp = get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0)
policy = Policy( environment=rw_mrp )

policy.intialize_policy_to_equiprobable() # already equiprobable after the Policy constructor; made explicit here

episode_obj = make_episode( 'C', policy, rw_mrp )

fig, ax = plt.subplots()

# ---------------- set up true value data for RMS calc --------------------
true_valueD = {'C':0.0} # { 'Win':0.0, 'Lose':0.0}

#print('rw_mrp.get_num_states() = ',rw_mrp.get_num_states())
delta = 2.0 / (rw_mrp.get_num_states()-1)
Nsides = int( rw_mrp.get_num_states() / 2) - 1
d = 0.0
for i in range(1, Nsides+1 ):
    d += delta
    true_valueD[ 'L-%i'%i] = float('%g'%-d) # the %g round-trip strips tiny floating-point noise
    true_valueD[ 'R+%i'%i] = float('%g'%d)
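For reference, the table above can be checked against a fresh StateValueColl via calc_rms_error(), the same call used in Example #7; a minimal sketch using only the objects created above:

# minimal sketch: RMS error of the untrained value table vs. true_valueD
sv = StateValueColl( rw_mrp, init_val=0.5 )    # V(s)=0.5 everywhere to start
print( 'initial RMS error =', sv.calc_rms_error( true_valueD ) )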
Example #2
                          epsilon=0.1, const_epsilon=True,
                          alpha=0.5, const_alpha=True)

print('_' * 55)
score = gridworld.get_policy_score(policy,
                                   start_state_hash=None,
                                   step_limit=1000)
print('Policy Score =', score, ' = (r_sum, n_steps, msg)')

steps_per_episodeL = learn_tracker.steps_per_episode()

print(gridworld.get_info())

episode = make_episode(gridworld.start_state_hash,
                       policy,
                       gridworld,
                       gridworld.terminal_set,
                       max_steps=20)

epi_summ_print(episode,
               policy,
               gridworld,
               show_rewards=False,
               show_env_states=True,
               none_str='*')

fig, ax = plt.subplots()
plt.title('SARSA Windy Gridworld')
plt.xlabel('Time Steps')
plt.ylabel('Episodes')
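The fragment ends after labeling the axes; a possible continuation (a sketch, not part of the original) turns steps_per_episodeL into cumulative time steps and plots episode count against them, the usual windy-gridworld learning curve:

# sketch: cumulative time steps on x, completed episodes on y
cum_stepsL = []
total_steps = 0
for n in steps_per_episodeL:
    total_steps += n
    cum_stepsL.append( total_steps )
ax.plot( cum_stepsL, range(1, len(cum_stepsL)+1) )
plt.show()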
Example #3
    from introrl.agent_supt.action_value_coll import ActionValueColl
    from introrl.policy import Policy
    from introrl.agent_supt.nstep_td_eval_walker import NStepTDWalker
    from introrl.agent_supt.episode_maker import make_episode

    gridworld = get_gridworld()
    sv = ActionValueColl(gridworld)

    pi = Policy(environment=gridworld)

    pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
    #pi.summ_print()

    eg = EpsilonGreedy(epsilon=0.5,
                       const_epsilon=True,
                       half_life=200,
                       N_episodes_wo_decay=0)

    episode_obj = make_episode((2, 0), pi, gridworld, eps_greedy=None)
    """environment, Nsteps=3, 
                 policy=None, episode_obj=None, 
                 terminal_set=None,
                 max_steps=sys.maxsize, eps_greedy=None"""

    print('Using an episode_obj')
    episode_obj.summ_print()
    print('                ...')
    NSW = NStepTDWalker(gridworld, Nsteps=16, episode_obj=episode_obj)
    NSW.do_sarsa_action_value_updates(sv,
                                      alpha=0.1,
                                      gamma=0.9,
                                      start_state_hash=None)
    #print()
    #gridworld.summ_print()
Example #4
from introrl.black_box_sims.random_walk_1000 import RandomWalk_1000Simulation
from introrl.agent_supt.episode_maker import make_episode
from introrl.policy import Policy

NUM_EPISODES = 100000
countD = {} # index=state, value=count 

RW = RandomWalk_1000Simulation()
policy = Policy(environment=RW)
policy.intialize_policy_to_equiprobable( env=RW )


for Nepi in range(NUM_EPISODES):
    episode = make_episode(500, policy, RW, max_steps=10000)
    
    for dr in episode.get_rev_discounted_returns( gamma=1.0 ):
        (s_hash, a_desc, reward, sn_hash, G) = dr
        
        countD[ s_hash ] = countD.get( s_hash, 0 ) + 1

SUM_VISITS = sum( list(countD.values()) )
freqL = []
for i in range(1,1001):
    freqL.append( countD.get(i,0) / float(SUM_VISITS) )

# copy and paste list into plot script
print('freqL =', repr(freqL))
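The plot script referred to in the comment above might look like this (a sketch; paste the printed freqL list where indicated):

# sketch of the companion plot script
import matplotlib.pyplot as plt

# freqL = [ ... ]   # paste the printed list here
plt.plot( range(1, len(freqL)+1), freqL )
plt.xlabel('State')
plt.ylabel('Fraction of Visits')
plt.title('1000-State Random Walk, Visit Distribution')
plt.show()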
Example #5
    # capture the returned objects (return pattern assumed, mirroring qlearning_epsilon_greedy in Example #6)
    policy, state_value = \
        sarsa_epsilon_greedy( sim,
                              initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              read_pickle_file='',
                              save_pickle_file='',
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001, gamma=1.0,
                              iteration_prints=0,
                              max_episode_steps=1000,
                              epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                              alpha=0.1, const_alpha=True, alpha_half_life=200,
                              N_episodes_wo_decay=0)

    episode = make_episode(sim.start_state_hash,
                           policy,
                           sim,
                           sim.terminal_set,
                           max_steps=20)
    epi_summ_print(episode,
                   policy,
                   sim,
                   show_rewards=False,
                   show_env_states=True,
                   none_str='*')

    sim.random_transition_prob = 0.0  # so arrows are drawn deterministically on policy diagram
    policy.save_diagram(sim,
                        inp_colorD=None,
                        save_name='sample_sim_policy',
                        show_arrows=True,
                        scale=1.0,
Example #6
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.episode_summ_print import epi_summ_print

    MB = MaximizationBiasMDP()
    MB.layout.s_hash_print(none_str='*')

    policy, state_value = \
        qlearning_epsilon_greedy( MB,
                              initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                              pcent_progress_print=0,
                              show_banner = True,
                              max_num_episodes=10, min_num_episodes=10, max_abserr=0.001,
                              gamma=1.0,
                              max_episode_steps=100,
                              epsilon=0.1,
                              alpha=0.1)

    episode = make_episode(MB.start_state_hash,
                           policy,
                           MB,
                           MB.terminal_set,
                           max_steps=20)
    epi_summ_print(episode,
                   policy,
                   MB,
                   show_rewards=False,
                   show_env_states=True,
                   none_str='*')
Example #7
alpha_mc = 0.02
gamma = 1.0

# analytic state values for the 5-state random walk (states A..E)
true_valueD = {'A':1.0/6.0, 'B':2.0/6.0, 'C':3.0/6.0, 'D':4.0/6.0, 'E':5.0/6.0}

for o_loop in range(1,101):
    print('%2i'%o_loop, end=' ')
    if o_loop % 20 == 0:
        print()
            
    # make 2 state value objects.
    sv_td = StateValueColl( rw_mrp, init_val=0.5 )
    sv_mc = StateValueColl( rw_mrp, init_val=0.5 )
    for i_loop in range(NumEpisodes):
                    
        episode = make_episode('C', policy, rw_mrp, rw_mrp.terminal_set)
            
        for dr in episode.get_rev_discounted_returns( gamma=gamma ):
            (s_hash, a_desc, reward, sn_hash, G) = dr
            
            sv_mc.mc_update( s_hash, alpha_mc, G)
        
            sv_td.td0_update( s_hash=s_hash, alpha=alpha_td, 
                                    gamma=gamma, sn_hash=sn_hash, 
                                    reward=reward)
                
        # add this loops state values to running_ave
        mc_rms_raveL[i_loop].add_val( sv_mc.calc_rms_error( true_valueD ) )
        td_rms_raveL[i_loop].add_val( sv_td.calc_rms_error( true_valueD ) )

mc_rmsL = [R.get_ave() for R in mc_rms_raveL]
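The loop above stops after averaging the MC curve; a natural continuation (a sketch, assuming matplotlib and the td_rms_raveL list built earlier in the same script) averages the TD curve the same way and plots both RMS-error histories:

# sketch: average the TD curve and compare with MC
td_rmsL = [R.get_ave() for R in td_rms_raveL]

import matplotlib.pyplot as plt
plt.plot( mc_rmsL, label='MC, alpha=%g' % alpha_mc )
plt.plot( td_rmsL, label='TD(0), alpha=%g' % alpha_td )
plt.xlabel('Episodes')
plt.ylabel('Averaged RMS Error')
plt.legend()
plt.show()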
Example #8
def mc_every_visit_prediction(
    policy,
    state_value_coll,
    all_start_states=False,
    do_summ_print=True,
    show_last_change=True,
    show_banner=True,
    max_episode_steps=10000,
    alpha=0.1,
    const_alpha=True,
    alpha_half_life=200,
    max_num_episodes=1000,
    min_num_episodes=10,
    max_abserr=0.001,
    gamma=0.9,
    result_list='abserr',
    true_valueD=None,
    value_snapshot_loopL=None
):  # if input, save V(s) snapshot at iteration steps indicated
    """
    ... GIVEN A POLICY TO EVALUATE  apply Monte Carlo Every Visit Prediction
    
    Use Episode Discounted Returns to find V(s), State-Value Function
    
    Terminates when abserr < max_abserr
    
    Assume that V(s), state_value_coll, has been initialized prior to call.
    (Note that the StateValues object has a reference to the Environment object)
    
    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any start state.
    
    state_value_coll WILL BE CHANGED... policy WILL NOT.
    """

    resultL = []  # based on result_list, can be "rms" or "abserr"
    value_snapD = {}  # index=loop counter, value=dict of {s_hash:Vs, ...}

    # ==> Note: the reference to Environment object as "state_value_coll.environment"
    Env = state_value_coll.environment
    episode = Episode(Env.name + ' Episode')

    alpha_obj = Alpha(alpha=alpha,
                      const_alpha=const_alpha,
                      half_life=alpha_half_life)

    if do_summ_print:
        print(
            '=============== EVALUATING THE FOLLOWING POLICY ===================='
        )
        policy.summ_print(verbosity=0,
                          environment=Env,
                          show_env_states=False,
                          none_str='*')

    if all_start_states:
        s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g' % (
            max_num_episodes, gamma)
        start_stateL = [s_hash for s_hash in Env.iter_all_action_states()]
    else:
        s = 'Starting a Maximum of %i Monte Carlo Iterations from state "%s"\nGamma = %g' % (
            max_num_episodes, str(Env.start_state_hash), gamma)
        start_stateL = [Env.start_state_hash]

    if show_banner:
        banner(s, banner_char='', leftMargin=0, just='center')

    num_episodes = 0
    keep_looping = True

    # episode-loop stopping criteria

    progress_str = ''
    while (num_episodes <= max_num_episodes - 1) and keep_looping:

        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        # policy evaluation
        random.shuffle(start_stateL)
        for start_hash in start_stateL:

            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break

            make_episode(start_hash,
                         policy,
                         Env,
                         Env.terminal_set,
                         episode=episode,
                         max_steps=max_episode_steps,
                         eps_greedy=None)

            num_episodes += 1

            for dr in episode.get_rev_discounted_returns(gamma=gamma):
                (s_hash, a_desc, reward, sn_hash, G) = dr
                state_value_coll.mc_update(s_hash, alpha_obj(), G)

        abserr = state_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            print(out_str, end=' ')
            progress_str = out_str

        if result_list == 'rms':
            resultL.append(state_value_coll.calc_rms_error(true_valueD))
        elif result_list == 'abserr':
            resultL.append(abserr)
        # any other value of result_list saves nothing to resultL

        # save a V(s) snapshot at the requested episode counts (inside the loop, per the docstring)
        if value_snapshot_loopL is not None and num_episodes in value_snapshot_loopL:
            value_snapD[num_episodes] = state_value_coll.get_snapshot()

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited MC Every-Visit Policy Evaluation', s)
        print('   num episodes   =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma          =', gamma)
        print('   estimated err  =', abserr)
        print('   Error limit    =', max_abserr)

        state_value_coll.summ_print(show_last_change=show_last_change,
                                    show_states=True)

    return resultL, value_snapD
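A minimal usage sketch for mc_every_visit_prediction, reusing only calls that appear in the earlier examples (the import of the function itself is not shown here and is assumed):

# usage sketch (import of mc_every_visit_prediction assumed; its module path is not shown in these examples)
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.mdp_data.random_walk_generic_mrp import get_random_walk

rw_mrp = get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0)
policy = Policy(environment=rw_mrp)
policy.intialize_policy_to_equiprobable()

sv = StateValueColl(rw_mrp, init_val=0.0)
resultL, value_snapD = mc_every_visit_prediction(policy, sv,
                                                 all_start_states=True,
                                                 gamma=1.0,
                                                 alpha=0.1,
                                                 max_num_episodes=200,
                                                 do_summ_print=True)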
Example #9
def mc_first_visit_prediction( policy, state_value_ave, first_visit=True, 
                               do_summ_print=True, showRunningAve=False,
                               max_episode_steps=10000,
                               max_num_episodes=1000, min_num_episodes=10, 
                               max_abserr=0.001, gamma=0.9):
    """
    ... GIVEN A POLICY TO EVALUATE  apply Monte Carlo First Visit Prediction
    
    Use Episode Discounted Returns to find V(s), State-Value Function
    
    Terminates when abserr < max_abserr
    
    Assume that V(s), state_value_ave, has been initialized prior to call.
    (Note that the StateValues object has a reference to the Environment object)
    
    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any start state.
    
    state_value_ave WILL BE CHANGED... policy WILL NOT.
    """
    
    # ==> Note: the reference to Environment object as "state_value_ave.environment"
    Env = state_value_ave.environment
    episode = Episode( Env.name + ' Episode' )
    
    if do_summ_print:
        print('=============== EVALUATING THE FOLLOWING POLICY ====================')
        policy.summ_print( verbosity=0, environment=Env, 
                   show_env_states=False, none_str='*')
                   
    s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g'%(max_num_episodes, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')
    
    keep_looping = True
       
    # episode-loop stopping criteria
    
    progress_str = ''
    num_episodes = 0
    
    while (num_episodes<=max_num_episodes-1) and keep_looping:
        
        keep_looping = False
        abserr = 0.0 # calculated below as part of termination criteria
        
        # policy evaluation 
        for start_hash in Env.iter_all_action_states( randomize=True ):
            
            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break
        
            make_episode(start_hash, policy, Env, Env.terminal_set, episode=episode,
                         max_steps=max_episode_steps, eps_greedy=None)
            
            num_episodes += 1
            
            for dr in episode.get_rev_discounted_returns( gamma=gamma, 
                                                          first_visit=first_visit, 
                                                          visit_type='S'):
                (s_hash, a_desc, reward, sn_hash, G) = dr
                state_value_ave.add_val( s_hash, G)
        
        abserr = state_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True
            
        if num_episodes < min_num_episodes:
            keep_looping = True # must loop for min_num_episodes at least
                    
        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%i%%'%( 5*(int(pc_done/5.0) ) )
        if out_str != progress_str:
            print(out_str, end=' ')
            progress_str = out_str
            
    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print( 'Exited MC First-Visit Policy Evaluation', s )
        print( '   num episodes   =', num_episodes, ' (min limit=%i)'%min_num_episodes, ' (max limit=%i)'%max_num_episodes )
        print( '   gamma          =', gamma )
        print( '   estimated err  =', abserr )
        print( '   Error limit    =', max_abserr )
    
        state_value_ave.summ_print( showRunningAve=showRunningAve, show_states=True)

    return abserr
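A usage sketch for mc_first_visit_prediction. It needs a running-average state-value object with add_val() and get_biggest_action_state_err(); the exact introrl class is not shown in these examples, so StateValueRunAveColl below is a hypothetical stand-in named by analogy with the ActionValueRunAveColl used in Example #10:

# hypothetical usage sketch -- StateValueRunAveColl is an assumed class name, and the
# imports of it and of mc_first_visit_prediction are not shown in these examples.
from introrl.policy import Policy
from introrl.mdp_data.random_walk_generic_mrp import get_random_walk

rw_mrp = get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0)
policy = Policy(environment=rw_mrp)
policy.intialize_policy_to_equiprobable()

state_value_ave = StateValueRunAveColl(rw_mrp)   # constructor assumed
abserr = mc_first_visit_prediction(policy, state_value_ave,
                                   first_visit=True,
                                   gamma=1.0,
                                   max_num_episodes=500,
                                   do_summ_print=True)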
Example #10
def mc_exploring_starts(environment,
                        initial_policy='default',
                        read_pickle_file='',
                        save_pickle_file='',
                        first_visit=True,
                        do_summ_print=True,
                        showRunningAve=False,
                        fmt_Q='%g',
                        fmt_R='%g',
                        show_initial_policy=True,
                        max_num_episodes=1000,
                        min_num_episodes=10,
                        max_abserr=0.001,
                        gamma=0.9,
                        max_episode_steps=10000,
                        iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ... 
    apply Monte Carlo Exploring Starts to find the OPTIMAL POLICY
    
    initial_policy can be 'default', 'random', policy_dictionary, Policy object
    
    Returns: Policy and ActionValueRunAveColl objects
    
    Use Episode Discounted Returns to find Q(s,a), Action-Value Function
    
    Terminates when abserr < max_abserr
    
    Assume that Q(s,a), action_value_ave, has been initialized prior to call.
    
    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.
    
    CREATES BOTH policy AND action_value OBJECTS.
    """

    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)
    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_exploring_starts')
        policy.intialize_policy_to_random(env=environment)
    elif isinstance(initial_policy, Policy):
        policy = initial_policy
    else:
        print('Initializing Policy to "custom policy" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    action_value_ave.init_Qsa_to_zero()  # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        if show_initial_policy:
            print(
                '=============== STARTING WITH THE INITIAL POLICY ===================='
            )
            policy.summ_print(verbosity=0,
                              environment=environment,
                              show_env_states=False,
                              none_str='*')


    s = 'Starting a Maximum of %i Monte Carlo Exploring Start Episodes\nfor "%s" with Gamma = %g'%\
        (max_num_episodes, environment.name, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True

    progress_str = ''
    while (num_episodes <= max_num_episodes - 1) and keep_looping:

        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        for start_hash in environment.iter_all_action_states(randomize=True):
            a_descL = environment.get_state_legal_action_list(start_hash)
            # randomize action order
            random.shuffle(a_descL)
            # try every initial action for each start_hash
            for a_desc in a_descL:

                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash,
                             policy,
                             environment,
                             environment.terminal_set,
                             episode=episode,
                             first_a_desc=a_desc,
                             max_steps=max_episode_steps,
                             eps_greedy=None)

                num_episodes += 1

                for dr in episode.get_rev_discounted_returns(
                        gamma=gamma, first_visit=first_visit, visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave(s, a)
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [a]
                            elif q == best_a_val:
                                bestL.append(a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy,
                                                 start_state_hash=None,
                                                 step_limit=1000)
            print(out_str, ' score=%s' % str(score),
                  ' = (r_sum, n_steps, msg)', '   estimated err =', abserr)
            progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited MC Exploring-Starts Iterations', s)
        print('   num episodes   =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma          =', gamma)
        print('   estimated err  =', abserr)
        print('   Error limit    =', max_abserr)

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment,
                          verbosity=0,
                          show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward',
                                     fmt=fmt_R,
                                     show_env_states=False,
                                     none_str='*')
        except:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
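A brief usage sketch for mc_exploring_starts, reusing the get_gridworld() helper seen in Examples #3 and #11 (its import, and that of mc_exploring_starts, are not shown in these snippets):

# sketch: optimize a policy on the gridworld from Examples #3 and #11
# (imports of get_gridworld and mc_exploring_starts assumed)
gridworld = get_gridworld()
policy, action_value_ave = mc_exploring_starts(gridworld,
                                               initial_policy='default',
                                               first_visit=True,
                                               gamma=0.9,
                                               max_num_episodes=1000,
                                               do_summ_print=True)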
Example #11
    gridworld = get_gridworld()

    pi = Policy(environment=gridworld)

    pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
    #pi.summ_print()

    eg = EpsilonGreedy(epsilon=0.2,
                       const_epsilon=True,
                       half_life=200,
                       N_episodes_wo_decay=0)

    episode = make_episode((2, 0),
                           pi,
                           gridworld,
                           gridworld.terminal_set,
                           eps_greedy=eg)

    episode.summ_print()

    epi_summ_print(episode,
                   pi,
                   gridworld,
                   show_rewards=True,
                   show_env_states=True,
                   none_str='*')

    epi_summ_print(episode,
                   pi,
                   gridworld,