Пример #1
0
    def get_policy(self):
        """Build a deterministic Policy from the current greedy actions.

        Every state yielded by the environment's ``iter_all_action_states``
        is assigned the action returned by ``get_best_greedy_action`` as its
        sole action.

        Returns:
            A new Policy object attached to ``self.environment``.
        """
        env = self.environment
        greedy_policy = Policy(environment=env)
        for state in env.iter_all_action_states():
            greedy_policy.set_sole_action(
                state, self.get_best_greedy_action(state))
        return greedy_policy
Пример #2
0
def qlearning_epsilon_greedy(
        environment,
        learn_tracker=None,  # track progress of learning
        initial_Qsa=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
        initial_action_value_coll=None,  # if input, use it.
        read_pickle_file='',
        save_pickle_file='',
        use_list_of_start_states=False,  # use list OR single start state of environment.
        do_summ_print=True,
        show_last_change=True,
        fmt_Q='%g',
        fmt_R='%g',
        pcent_progress_print=10,
        show_banner=True,
        max_num_episodes=sys.maxsize,
        min_num_episodes=10,
        max_abserr=0.001,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,
        const_epsilon=True,
        epsilon_half_life=200,
        alpha=0.1,
        const_alpha=True,
        alpha_half_life=200,
        N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Q-Learning Temporal Difference with epsilon-greedy exploration.

    Episodes start from ``environment.start_state_hash`` or, when
    ``use_list_of_start_states`` is True, from each state returned by
    ``environment.limited_start_state_list()``.  Every step selects an action
    with ``action_value_coll.get_best_eps_greedy_action`` and then calls
    ``action_value_coll.qlearning_update``.

    After each pass over the start states the EpsilonGreedy and Alpha objects
    are advanced by one episode count and the loop continues while
    ``get_biggest_action_state_err() > max_abserr`` or fewer than
    ``min_num_episodes`` episodes have been started, up to
    ``max_num_episodes``.

    Args:
        environment: environment object supplying states, actions, rewards.
        learn_tracker: optional tracker; receives ``add_new_episode`` and
            ``add_sarsn_to_current_episode`` calls.
        initial_Qsa: initial value passed to ActionValueColl as ``init_val``.
        initial_action_value_coll: if given, used instead of a new
            ActionValueColl.
        read_pickle_file: if non-empty, action values are loaded from it.
        save_pickle_file: if non-empty, policy and action values are saved.
        use_list_of_start_states: iterate over a list of start states
            instead of the single environment start state.
        do_summ_print: print setup and final summaries.
        show_last_change, fmt_Q: forwarded to ``action_value_coll.summ_print``.
        fmt_R: forwarded to ``environment.layout_print``.
        pcent_progress_print: progress print interval in percent (0 = off).
        show_banner: print the start banner and start-state list.
        max_num_episodes, min_num_episodes: episode count limits.
        max_abserr: error threshold used for termination.
        gamma: discount factor passed to ``qlearning_update``.
        iteration_prints: accepted but not used by this function.
        max_episode_steps: step limit for each episode.
        epsilon, const_epsilon, epsilon_half_life, N_episodes_wo_decay:
            forwarded to EpsilonGreedy.
        alpha, const_alpha, alpha_half_life: forwarded to Alpha.

    Returns: Policy and ActionValueColl objects

    CREATES BOTH policy AND action_value_coll OBJECTS.
    """

    # create EpsilonGreedy, Alpha and ActionValueColl objects
    eg = EpsilonGreedy(epsilon=epsilon,
                       const_epsilon=const_epsilon,
                       half_life=epsilon_half_life,
                       N_episodes_wo_decay=N_episodes_wo_decay)

    alpha_obj = Alpha(alpha=alpha,
                      const_alpha=const_alpha,
                      half_life=alpha_half_life)

    if initial_action_value_coll is None:
        action_value_coll = ActionValueColl(environment, init_val=initial_Qsa)
    else:
        action_value_coll = initial_action_value_coll
    #action_value_coll.summ_print()
    num_s_hash = len(environment.get_all_action_state_hashes())

    if read_pickle_file:
        action_value_coll.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        print(
            '================== EPSILON GREEDY DEFINED AS ========================'
        )
        eg.summ_print()

        print(
            '================== LEARNING RATE DEFINED AS ========================'
        )
        alpha_obj.summ_print()

    if show_banner:
        s = 'Starting a Maximum of %i Q-Learning Epsilon Greedy Episodes'%max_num_episodes +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, gamma, alpha_obj() )
        banner(s, banner_char='', leftMargin=0, just='center')

    # Iterate over a list of known possible start states
    if use_list_of_start_states:
        loop_stateL = environment.limited_start_state_list()
    else:
        loop_stateL = [environment.start_state_hash]

    if show_banner:
        print(
            '======================= Iterating over Start States =================================='
        )
        print(loop_stateL)
        print(
            '======================================================================================'
        )

    # set counter and flag
    episode_loop_counter = 0
    keep_looping = True

    # Bind these before the loop so the final summary cannot hit a NameError
    # when the loop body never executes (e.g. max_num_episodes <= 0).
    abserr = 0.0
    Nterminal_episodes = set()

    progress_str = ''
    while (episode_loop_counter <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria
        Nterminal_episodes = set(
        )  # tracks if ended at terminal_set or max_num_episodes

        for start_hash in loop_stateL:
            # Check the limit BEFORE counting the episode so the counter
            # never overshoots max_num_episodes in the reported summary.
            if episode_loop_counter >= max_num_episodes:
                break
            episode_loop_counter += 1

            if learn_tracker is not None:
                learn_tracker.add_new_episode()
            s_hash = start_hash

            for n_episode_steps in range(max_episode_steps):
                a_desc = action_value_coll.get_best_eps_greedy_action(
                    s_hash, epsgreedy_obj=eg)

                # Begin an episode
                if a_desc is None:
                    Nterminal_episodes.add(start_hash)
                    print('break for a_desc==None')
                    break
                else:
                    sn_hash, reward = environment.get_action_snext_reward(
                        s_hash, a_desc)
                    if learn_tracker is not None:
                        learn_tracker.add_sarsn_to_current_episode(
                            s_hash, a_desc, reward, sn_hash)

                    if sn_hash is None:
                        Nterminal_episodes.add(start_hash)
                        print('break for sn_hash==None, #steps=',
                              n_episode_steps, ' s_hash=%s' % str(s_hash),
                              ' a_desc=%s' % str(a_desc))
                        break
                    else:
                        action_value_coll.qlearning_update(s_hash=s_hash,
                                                           a_desc=a_desc,
                                                           sn_hash=sn_hash,
                                                           alpha=alpha_obj(),
                                                           gamma=gamma,
                                                           reward=reward)
                        if sn_hash in environment.terminal_set:
                            Nterminal_episodes.add(start_hash)
                            if (n_episode_steps == 0) and (num_s_hash > 2):
                                print(
                                    '1st step break for sn_hash in terminal_set',
                                    sn_hash, ' s_hash=%s' % str(s_hash),
                                    ' a_desc=%s' % str(a_desc))
                            break
                        s_hash = sn_hash

        # increment episode counter on EpsilonGreedy and Alpha objects
        eg.inc_N_episodes()
        alpha_obj.inc_N_episodes()

        abserr = action_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if episode_loop_counter < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(episode_loop_counter) / float(max_num_episodes)

        if pcent_progress_print > 0:
            # round progress down to a multiple of pcent_progress_print so a
            # line is printed only when a new multiple is reached
            out_str = '%3i%%' % (pcent_progress_print *
                                 (int(pc_done / float(pcent_progress_print))))
        else:
            out_str = progress_str

        if out_str != progress_str:
            print(out_str, end=' ')
            print('Nterminal episodes =', len(Nterminal_episodes), ' of ',
                  len(loop_stateL))
            progress_str = out_str

    # Build the returned policy from the learned action values;
    # epsgreedy_obj=None is passed so no EpsilonGreedy object is involved.
    policy = Policy(environment=environment)
    for s_hash in environment.iter_all_action_states():
        a_desc = action_value_coll.get_best_eps_greedy_action(
            s_hash, epsgreedy_obj=None)
        policy.set_sole_action(s_hash, a_desc)

    if do_summ_print:
        s = ''
        if episode_loop_counter >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited Epsilon Greedy, TD(0) Value Iteration', s)
        print('   # episodes      =', episode_loop_counter,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma           =', gamma)
        print('   estimated err   =', abserr)
        print('   Error limit     =', max_abserr)
        print('Nterminal episodes =', len(Nterminal_episodes), ' of ',
              len(loop_stateL))

        action_value_coll.summ_print(show_last_change=show_last_change,
                                     fmt_Q=fmt_Q)
        policy.summ_print(environment=environment,
                          verbosity=0,
                          show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward',
                                     fmt=fmt_R,
                                     show_env_states=False,
                                     none_str='*')
        except Exception:
            # Best-effort display only; unlike a bare except this still lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

        print(
            '================== EPSILON GREEDY DEFINED AS ========================'
        )
        eg.summ_print()

        print(
            '================== LEARNING RATE DEFINED AS ========================'
        )
        alpha_obj.summ_print()

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_coll.save_to_pickle_file(save_pickle_file)

    return policy, action_value_coll  #, steps_per_episodeL, reward_sum_per_episodeL
Пример #3
0
    print('_____________ Value Iteration ________________')
else:
    print('_____________ Policy Iteration ________________')

for gamma in (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999):

    if do_VI:
        policy, sv = dp_value_iteration(robot,
                                        do_summ_print=False,
                                        fmt_V='%.1f',
                                        max_iter=1000,
                                        err_delta=0.001,
                                        gamma=gamma)
    else:

        policy = Policy(environment=robot)
        policy.set_policy_from_piD(robot.get_default_policy_desc_dict())

        sv = StateValues(robot)
        sv.init_Vs_to_zero()

        dp_policy_iteration(policy,
                            sv,
                            do_summ_print=False,
                            max_iter=1000,
                            err_delta=0.001,
                            gamma=gamma)

    print('gamma=%5g' % gamma, '  Fallen=', policy.get_single_action('Fallen'),
          '  Moving=', policy.get_single_action('Moving'), '  Standing=',
          policy.get_single_action('Standing'), '  Fallen=',
Пример #4
0
                if show_last_change:
                    print(' Last Delta = %s' %
                          self.last_delta_VsD.get(s_hash, None))
                else:
                    print()


if __name__ == "__main__":  # pragma: no cover
    # Small manual demo: build a default policy, then feed a fixed series of
    # mc_update calls into a StateValueColl and print the results.

    from introrl.policy import Policy
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()
    policyD = gridworld.get_default_policy_desc_dict()

    pi = Policy(environment=gridworld)
    #pi.learn_all_states_and_actions_from_env( gridworld )
    pi.set_policy_from_piD(policyD)

    # -------------

    sv = StateValueColl(gridworld)

    # argument triples handed to sv.mc_update, applied in this order
    # on each of the 10 rounds
    demo_updates = (
        ((0, 0), 0.2, 2.0),
        ((0, 0), 0.2, 3.0),
        ((0, 1), 0.5, 1.0),
    )
    for _ in range(10):
        for update_args in demo_updates:
            sv.mc_update(*update_args)

    value_at_origin = sv.get_Vs((0, 0))
    print('Value at (0,0) is:', value_at_origin)
    biggest_err = sv.get_biggest_action_state_err()
    print('get_biggest_action_state_err = ', biggest_err, '%')
Пример #5
0
from introrl.mc_funcs.mc_fv_prediction import mc_first_visit_prediction
from introrl.black_box_sims.blackjack_sim import BlackJackSimulation
from introrl.policy import Policy
from introrl.agent_supt.state_value_run_ave_coll import StateValueRunAveColl

# Monte Carlo first-visit prediction of a fixed blackjack policy.
BJ = BlackJackSimulation()

# default policy is hit on everything except 20 & 21.
pi = Policy(environment=BJ)
pi.set_policy_from_piD(BJ.get_default_policy_desc_dict())

sv = StateValueRunAveColl(BJ)

# Manual toggle: 1 selects the short run, 0 the long run.
if 1:
    n_episodes, pickle_name = 10000, 'mc_blackjack_10000_eval'
else:
    n_episodes, pickle_name = 500000, 'mc_blackjack_500000_eval'

mc_first_visit_prediction(pi,
                          sv,
                          max_num_episodes=n_episodes,
                          max_abserr=0.001,
                          gamma=1.0)
sv.save_to_pickle_file(fname=pickle_name)
Пример #6
0
from introrl.policy import Policy
from introrl.black_box_sims.racetrack_1_sim import RaceTrack_1

# Demo: inspect one state's action object before and after loading the
# racetrack's default policy.
RT = RaceTrack_1()
sca = Policy(environment=RT)

DEMO_STATE = (25, 7, 0, 1)  # state hash examined throughout this script

sca.add_state_action(DEMO_STATE)
sca.set_action_prob(DEMO_STATE, (1, 1), prob=1.0)

#sca.summ_print()

# state/action object after the single hand-set action
SA = sca.get_SA_object(DEMO_STATE)
print(SA)
SA.summ_print()
print('-' * 55)

# same state after the default policy has been applied
default_piD = RT.get_default_policy_desc_dict()
sca.set_policy_from_piD(default_piD)
SA = sca.get_SA_object(DEMO_STATE)
print(SA)
SA.summ_print()
Пример #7
0
import matplotlib
import matplotlib.pyplot as plt

from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.mdp_data.random_walk_mrp import get_random_walk

# Environment used for every prediction run below.
rw_mrp = get_random_walk()

# Policy attached to the environment; this fragment sets no actions on it.
policy = Policy( environment=rw_mrp )


fig, ax = plt.subplots()

# Reference values for states 'A'..'E', handed to the prediction routine as true_valueD.
true_valueD = {'A':1.0/6.0, 'B':2.0/6.0, 'C':3.0/6.0, 'D':4.0/6.0, 'E':5.0/6.0}
# One batch of runs per constant step size alpha.
for alpha in [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15]:

    resultLL = [] # a list of result lists
    for loop in range(100): # average rms curves over 100 runs
        # fresh value collection for each run, created with init_val=0.5
        sv = StateValueColl( rw_mrp, init_val=0.5 )
        
        # min and max episode counts are both 100, so each run uses exactly 100 episodes.
        # NOTE(review): result_list='rms' presumably makes resultL a per-episode
        # RMS-error list measured against true_valueD -- confirm in mc_every_visit_prediction.
        resultL, value_snapD = mc_every_visit_prediction( policy, sv,  all_start_states=False,
                                   do_summ_print=False, show_last_change=False,
                                   show_banner=False,
                                   max_episode_steps=1000,
                                   alpha=alpha, const_alpha=True, alpha_half_life=200,
                                   max_num_episodes=100, min_num_episodes=100, max_abserr=0.001, gamma=1.0,
                                   result_list='rms', true_valueD=true_valueD)
        resultLL.append( resultL )
    #print( 'sv.calc_rms_error(true_valueD) =', sv.calc_rms_error(true_valueD) )
Пример #8
0
from introrl.black_box_sims.random_walk_1000 import RandomWalk_1000Simulation
from introrl.agent_supt.episode_maker import make_episode
from introrl.policy import Policy

NUM_EPISODES = 100000
countD = {}  # key=state hash, value=number of times the state was recorded

RW = RandomWalk_1000Simulation()
policy = Policy(environment=RW)
policy.intialize_policy_to_equiprobable(env=RW)

# Generate episodes that all start in state 500 and tally the state of every
# entry returned by get_rev_discounted_returns.
for _ in range(NUM_EPISODES):
    episode = make_episode(500, policy, RW, max_steps=10000)

    for s_hash, _a_desc, _reward, _sn_hash, _G in \
            episode.get_rev_discounted_returns(gamma=1.0):
        countD[s_hash] = countD.get(s_hash, 0) + 1

SUM_VISITS = sum(countD.values())
# Fraction of all recorded visits for states 1..1000 (unseen states give 0).
freqL = [countD.get(i, 0) / float(SUM_VISITS) for i in range(1, 1001)]

# copy and paste list into plot script
print('freqL =', repr(freqL))
from introrl.dp_funcs.dp_policy_iter import dp_policy_iteration
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.mdp_data.car_rental_const_rtn import get_env
from introrl.utils import pickle_esp

# Policy iteration on the environment returned by car_rental_const_rtn.get_env().
env = get_env()

# Start from a randomly initialised policy.
policy = Policy(environment=env)
policy.intialize_policy_to_random(env=env)

# State values start at zero.
state_value = StateValues(env)
state_value.init_Vs_to_zero()

# policy and state_value are passed in and reused below; the call's return value is not used.
dp_policy_iteration(policy,
                    state_value,
                    do_summ_print=True,
                    show_start_policy=True,
                    max_iter=1000,
                    err_delta=0.0001,
                    gamma=0.9)

# Persist environment, state values and policy together under one file name.
pickle_esp.save_to_pickle_file(fname='dp_car_rental_PI_const_rtn',
                               env=env,
                               state_values=state_value,
                               policy=policy)

state_value.summ_print(fmt_V='%.1f')

policy.save_diagram(env,
                    inp_colorD={
# Manual toggle: 1 trains a policy with mc_epsilon_greedy and pickles it;
# change to 0 to load the previously pickled policy instead.
# NOTE(review): RT, mc_epsilon_greedy, Policy and plt are defined outside this fragment.
if 1:
    pi, av = mc_epsilon_greedy(RT,
                               initial_policy='default',
                               first_visit=True,
                               do_summ_print=False,
                               showRunningAve=False,
                               fmt_Q='%g',
                               fmt_R='%g',
                               show_initial_policy=False,
                               max_num_episodes=1000,
                               min_num_episodes=10,
                               max_abserr=0.001,
                               gamma=0.9,
                               iteration_prints=0,
                               max_episode_steps=10000,
                               epsilon=0.1,
                               const_epsilon=True,
                               half_life=500,
                               N_episodes_wo_decay=0)

    pi.save_to_pickle_file('racetrack_2_sim')
else:
    pi = Policy(environment=RT)
    pi.init_from_pickle_file('racetrack_2_sim')

# Draw the policy on a fresh axes via the environment's own plot helper.
fig, ax = plt.subplots()
RT.plot_policy(ax, pi)

# The figure is displayed first and written to disk afterwards.
plt.show()
fig.savefig("racetrack_2_sim.png")
Пример #11
0
from introrl.dp_funcs.dp_policy_eval import dp_policy_evaluation
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.mdp_data.sutton_ex4_1_grid import get_gridworld

# Evaluate an equiprobable policy on the gridworld from sutton_ex4_1_grid.
gridworld = get_gridworld()

# Policy initialised with intialize_policy_to_equiprobable.
pi = Policy(environment=gridworld)
pi.intialize_policy_to_equiprobable(env=gridworld)

# State values start at zero; sv is passed to the evaluation call below.
sv = StateValues(gridworld)
sv.init_Vs_to_zero()

# Undiscounted evaluation (gamma=1.); max_iter and err_delta are the limits passed to the routine.
dp_policy_evaluation(pi,
                     sv,
                     max_iter=1000,
                     err_delta=0.001,
                     gamma=1.,
                     fmt_V='%.1f')

#sv.summ_print( fmt_V='%.3f', show_states=False )
pi.summ_print(environment=gridworld, verbosity=0, show_env_states=False)

#print( gridworld.get_info() )
Пример #12
0
def mc_exploring_starts(environment,
                        initial_policy='default',
                        read_pickle_file='',
                        save_pickle_file='',
                        first_visit=True,
                        do_summ_print=True,
                        showRunningAve=False,
                        fmt_Q='%g',
                        fmt_R='%g',
                        show_initial_policy=True,
                        max_num_episodes=1000,
                        min_num_episodes=10,
                        max_abserr=0.001,
                        gamma=0.9,
                        max_episode_steps=10000,
                        iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Monte Carlo Exploring Starts to search for the OPTIMAL POLICY

    Each sweep visits every action state (in randomized order) and starts one
    episode per legal first action of that state.  The discounted returns of
    the episode are added to a running average Q(s,a), and the policy at each
    visited state is reset to an action with the highest average (ties are
    broken randomly).

    Sweeps continue while ``get_biggest_action_state_err() > max_abserr`` or
    fewer than ``min_num_episodes`` episodes have been run, up to
    ``max_num_episodes``.

    Args:
        environment: environment object supplying states, actions, episodes.
        initial_policy: 'default', 'random', a policy dictionary, or a
            Policy object.
        read_pickle_file: if non-empty, policy and action values are loaded
            from it.
        save_pickle_file: if non-empty, policy and action values are saved.
        first_visit: forwarded to ``episode.get_rev_discounted_returns``.
        do_summ_print: print the initial policy and final summaries.
        showRunningAve, fmt_Q: forwarded to ``action_value_ave.summ_print``.
        fmt_R: forwarded to ``environment.layout_print``.
        show_initial_policy: print the starting policy (needs do_summ_print).
        max_num_episodes, min_num_episodes: episode count limits.
        max_abserr: error threshold used for termination.
        gamma: discount factor for the returns.
        max_episode_steps: step limit for each episode.
        iteration_prints: accepted but not used by this function.

    Returns: Policy and ActionValueRunAveColl objects

    CREATES BOTH policy AND action_value OBJECTS.
    """

    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)
    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_exploring_starts')
        policy.intialize_policy_to_random(env=environment)
    elif isinstance(initial_policy, Policy):
        policy = initial_policy
    else:
        print('Initializing Policy to "custom policy" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    action_value_ave.init_Qsa_to_zero(
    )  # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print and show_initial_policy:
        print(
            '=============== STARTING WITH THE INITIAL POLICY ===================='
        )
        policy.summ_print(verbosity=0,
                          environment=environment,
                          show_env_states=False,
                          none_str='*')

    s = 'Starting a Maximum of %i Monte Carlo Exploring Start Episodes\nfor "%s" with Gamma = %g'%\
        (max_num_episodes, environment.name, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True

    # Bind before the loop so the final summary cannot hit a NameError when
    # the loop body never executes (e.g. max_num_episodes <= 0).
    abserr = 0.0

    progress_str = ''
    while (num_episodes <= max_num_episodes - 1) and keep_looping:

        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        for start_hash in environment.iter_all_action_states(randomize=True):
            # Copy before shuffling: the environment may hand back its own
            # internal list, which must not be reordered in place.
            a_descL = list(environment.get_state_legal_action_list(start_hash))
            # randomize action order
            random.shuffle(a_descL)
            # try every initial action for each start_hash
            for a_desc in a_descL:

                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                # the shared Episode object is passed in via episode= and
                # its returns are read right after this call
                make_episode(start_hash,
                             policy,
                             environment,
                             environment.terminal_set,
                             episode=episode,
                             first_a_desc=a_desc,
                             max_steps=max_episode_steps,
                             eps_greedy=None)

                num_episodes += 1

                for dr in episode.get_rev_discounted_returns(
                        gamma=gamma, first_visit=first_visit, visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    # greedy policy improvement at s with random tie-breaking
                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_val = float('-inf')
                        bestL = [aL[0]]
                        for cand_a in aL:
                            q = action_value_ave.get_ave(s, cand_a)
                            if q > best_a_val:
                                best_a_val = q
                                bestL = [cand_a]
                            elif q == best_a_val:
                                bestL.append(cand_a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        # progress is reported in 5% steps
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy,
                                                 start_state_hash=None,
                                                 step_limit=1000)
            print(out_str, ' score=%s' % str(score),
                  ' = (r_sum, n_steps, msg)', '   estimated err =', abserr)
            progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited MC First-Visit Value Iteration', s)
        print('   num episodes   =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma          =', gamma)
        print('   estimated err  =', abserr)
        print('   Error limit    =', max_abserr)

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment,
                          verbosity=0,
                          show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward',
                                     fmt=fmt_R,
                                     show_env_states=False,
                                     none_str='*')
        except Exception:
            # Best-effort display only; unlike a bare except this still lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
Пример #13
0
        print('  --> Final Policy AFTER POLICY ITERATION <--')
        policy.summ_print(environment=state_value.environment,
                          verbosity=0,
                          show_env_states=False)


if __name__ == "__main__":  # pragma: no cover
    import sys
    from introrl.policy import Policy
    from introrl.state_values import StateValues
    from introrl.dp_funcs.dp_policy_eval import dp_policy_evaluation
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()
    pi = Policy(environment=gridworld)

    #pi.intialize_policy_to_equiprobable(env=gridworld)
    pi.intialize_policy_to_random(env=gridworld)
    #pi.learn_all_states_and_actions_from_env( gridworld )

    #pi.set_policy_from_piD( gridworld.get_default_policy_desc_dict() )

    # change one action from gridworld default
    pi.set_sole_action((1, 0), 'D')  # is 'U' in default

    sv = StateValues(gridworld)
    sv.init_Vs_to_zero()

    dp_policy_iteration(pi,
                        sv,
Пример #14
0
def dp_value_iteration(environment,
                       allow_multi_actions=False,
                       do_summ_print=True,
                       fmt_V='%g',
                       fmt_R='%g',
                       max_iter=1000,
                       err_delta=0.001,
                       gamma=0.9,
                       iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Value Iteration to find the OPTIMAL POLICY

    State values are updated in place, one state at a time, to the best
    expected one-step return over all actions of that state.  Sweeps repeat
    until the largest change in a sweep is no more than
    ``err_delta * VI_STOP_CRITERIA`` or ``max_iter`` sweeps have run.  The
    policy is then set from the final state values.

    Args:
        environment: environment providing ``iter_next_state_prob_reward``.
        allow_multi_actions: if True, the policy gives equal probability to
            every action returned by
            ``multi_argmax_vmax_dict(VsD, err_delta=err_delta)``; otherwise a
            single best action is set.
        do_summ_print: print a summary of the run, state values and policy.
        fmt_V: forwarded to ``state_value.summ_print``.
        fmt_R: forwarded to ``environment.layout_print``.
        max_iter: maximum number of sweeps.
        err_delta: error tolerance, scaled by VI_STOP_CRITERIA.
        gamma: discount factor; ``gamma == 0`` is accepted.
        iteration_prints: if non-zero, print delta every that many sweeps.

    Returns: policy and state_value objects

    CREATES BOTH policy AND state_value OBJECTS.
    """

    # create Policy and StateValues objects
    policy = Policy(environment=environment)
    policy.intialize_policy_to_random(env=environment)

    state_value = StateValues(environment)
    state_value.init_Vs_to_zero()  # Terminal states need to be 0.0
    #state_value.summ_print()

    def _calc_action_values(s_hash):
        """Return dict: index=a_desc, value=expected one-step return from s_hash."""
        VsD = {}
        # MUST include currently zero prob actions
        for a_desc, _a_prob in policy.iter_policy_ap_for_state(
                s_hash, incl_zero_prob=True):
            calcd_v = 0.0

            for sn_hash, t_prob, reward in \
                environment.iter_next_state_prob_reward(s_hash, a_desc, incl_zero_prob=False):

                calcd_v += t_prob * (reward + gamma * state_value(sn_hash))

            VsD[a_desc] = calcd_v
        return VsD

    # set counter and flag
    loop_counter = 0
    all_done = False

    # value-iteration stopping criteria
    # if gamma==1.0 value iteration will never stop SO limit to gamma==0.999 stop criteria
    #  (VI terminates if delta < err_delta * VI_STOP_CRITERIA)
    #  (typically err_delta = 0.001)
    if gamma == 0:
        # (1-gamma)/gamma is unbounded as gamma -> 0; avoid ZeroDivisionError.
        # With no discounted future term the loop stops after one sweep.
        VI_STOP_CRITERIA = float('inf')
    else:
        VI_STOP_CRITERIA = max((1.0 - gamma) / gamma, (1.0 - 0.999) / 0.999)
    error_limit = err_delta * VI_STOP_CRITERIA

    # Bind before the loop so the summary cannot hit a NameError when
    # max_iter <= 0 and no sweep is executed.
    delta = 0.0

    while (loop_counter < max_iter) and (not all_done):
        loop_counter += 1
        all_done = True
        delta = 0.0  # used to calc largest change in state_value

        for s_hash in policy.iter_all_policy_states():
            VsD = _calc_action_values(s_hash)

            best_a_desc, best_a_val = argmax_vmax_dict(VsD)
            delta = max(delta, abs(best_a_val - state_value(s_hash)))
            # in-place update: later states in this sweep see the new value
            state_value[s_hash] = best_a_val

        if delta > error_limit:
            all_done = False

        if iteration_prints and (loop_counter % iteration_prints == 0):
            print('Loop:%6i' % loop_counter, '  delta=%g' % delta)

    # Now that State-Values have been determined, set policy
    for s_hash in policy.iter_all_policy_states():
        VsD = _calc_action_values(s_hash)

        if allow_multi_actions:
            best_a_list, best_a_val = multi_argmax_vmax_dict(
                VsD, err_delta=err_delta)

            policy.set_sole_action(s_hash,
                                   best_a_list[0])  # zero all other actions
            prob = 1.0 / len(best_a_list)
            for a_desc in best_a_list:
                policy.set_action_prob(s_hash, a_desc, prob=prob)
        else:
            best_a_desc, best_a_val = argmax_vmax_dict(VsD)
            policy.set_sole_action(s_hash, best_a_desc)

    if do_summ_print:
        s = ''
        if loop_counter >= max_iter:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited Value Iteration', s)
        print('   iterations     =', loop_counter, ' (limit=%i)' % max_iter)
        print('   measured delta =', delta)
        print('   gamma          =', gamma)
        print('   err_delta      =', err_delta)
        print('   error limit    =', error_limit)
        print('   STOP CRITERIA  =', VI_STOP_CRITERIA)

        state_value.summ_print(fmt_V=fmt_V)
        policy.summ_print(environment=environment,
                          verbosity=0,
                          show_env_states=False)

        environment.layout_print(vname='reward',
                                 fmt=fmt_R,
                                 show_env_states=False,
                                 none_str='*')

    return policy, state_value
Пример #15
0
from introrl.dp_funcs.dp_policy_iter import dp_policy_iteration
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.mdp_data.car_rental import get_env
from introrl.utils import pickle_esp

# Policy iteration on the environment returned by car_rental.get_env().
env = get_env()

# Start from a randomly initialised policy.
policy = Policy(environment=env)
policy.intialize_policy_to_random(env=env)

# State values start at zero.
state_value = StateValues(env)
state_value.init_Vs_to_zero()

# policy and state_value are passed in; the call's return value is not used here.
dp_policy_iteration(policy,
                    state_value,
                    do_summ_print=True,
                    show_start_policy=True,
                    max_iter=1000,
                    err_delta=0.0001,
                    gamma=0.9)

diag_colorD = {
    '5': 'r',
    '4': 'g',
    '3': 'b',
    '2': 'c',
    '1': 'y',
    '0': 'w',
    '-5': 'r',
    '-4': 'g',
Пример #16
0
class MyTest(unittest.TestCase):
    """Tests for Policy, run against the environment from get_gridworld()."""

    def setUp(self):
        """Give every test its own environment and freshly initialized Policy."""
        unittest.TestCase.setUp(self)
        world = get_gridworld()
        self.gridworld = world
        self.P = Policy(environment=world)
        self.P.intialize_policy_to_equiprobable(env=world)

    def tearDown(self):
        """Drop the Policy created in setUp."""
        unittest.TestCase.tearDown(self)
        del self.P

    def test_should_always_pass_cleanly(self):
        """Should always pass cleanly."""

    def test_myclass_existence(self):
        """Check that myclass exists"""
        # setUp must have produced a real Policy instance.
        self.assertIsInstance(self.P, Policy)

    def test_set_policy_from_default_pi(self):
        """test set policy from default pi"""
        default_piD = self.gridworld.get_default_policy_desc_dict()
        self.P.set_policy_from_piD(default_piD)

        # (action, expected probability) pairs checked at state (2, 2).
        expectations = (('U', 1.0), ('R', 0.0), ('D', None))
        for action, expected_prob in expectations:
            self.assertEqual(self.P.get_action_prob((2, 2), action),
                             expected_prob)

    # ---- disabled tests, kept commented out exactly as found ----
    #def test_set_policy_from_list_of_actions(self):
    #    """test set policy from list of actions"""
    #    piD = {(0, 0):('R','D') }
    #    self.P.set_policy_from_piD( piD )

    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.5)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.5)

    #def test_set_policy_from_list_of_action_probs(self):
    #    """test set policy from list of action probs"""
    #    piD = {(0, 0):[('R',0.6), ('D',0.4)] }
    #    self.P.set_policy_from_piD( piD )

    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.6)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.4)

    #    # make (action, prob) entry too long.
    #    with self.assertRaises(ValueError):
    #        piD = {(0, 0):[('R',0.6,0.4), ('D',0.4,0.6)] }
    #        self.P.set_policy_from_piD( piD )

    def test_learn_all_s_and_a(self):
        """test learn all s and a"""
        # Only checks that the call completes without raising.
        self.P.learn_all_states_and_actions_from_env(self.gridworld)

    def test_initialize_to_random(self):
        """test initialize to random"""
        self.P.intialize_policy_to_random(env=self.gridworld)
        action_probs = self.P.get_list_of_all_action_desc_prob(
            (0, 2), incl_zero_prob=True)

        # Exactly one of the three actions at (0, 2) carries all the weight.
        sorted_probs = sorted(prob for (_action, prob) in action_probs)
        self.assertEqual(sorted_probs, [0.0, 0.0, 1.0])

    def test_iterate_adesc_p(self):
        """test iterate adesc p"""
        pairs = [(action, prob)
                 for (action, prob) in self.P.iter_policy_ap_for_state(
                     (0, 0), incl_zero_prob=False)]

        for expected_pair in (('R', 0.5), ('D', 0.5)):
            self.assertIn(expected_pair, pairs)
        self.assertNotIn(('U', 0.5), pairs)

    def test_iterate_all_states(self):
        """test iterate all states"""
        states = sorted(self.P.iter_all_policy_states())

        self.assertEqual(len(states), 9)
        self.assertEqual(states[0], (0, 0))
        self.assertEqual(states[-1], (2, 3))

    def test_get_single_action(self):
        """test get single action"""
        chosen = self.P.get_single_action((0, 0))
        self.assertIn(chosen, ('R', 'D'))

        # (99, 99) is expected to yield None rather than an action.
        missing = self.P.get_single_action((99, 99))
        self.assertEqual(missing, None)
Пример #17
0
                                     a_desc=self.A[self.tau],
                                     delta=delta)


if __name__ == "__main__":  # pragma: no cover
    # Ad-hoc demo; runs only when this module is executed as a script.

    from introrl.mdp_data.simple_grid_world import get_gridworld
    from introrl.policy import Policy
    from introrl.agent_supt.epsilon_calc import EpsilonGreedy
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.action_value_coll import ActionValueColl

    gridworld = get_gridworld()
    # Action-value collection for the gridworld; not used again in the
    # lines visible here.
    sv = ActionValueColl(gridworld)

    pi = Policy(environment=gridworld)

    # Load the environment's default policy description into `pi`.
    pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
    #pi.summ_print()

    # NOTE(review): `eg` is built with a constant epsilon of 0.5 but is not
    # handed to make_episode below, which receives eps_greedy=None.
    eg = EpsilonGreedy(epsilon=0.5,
                       const_epsilon=True,
                       half_life=200,
                       N_episodes_wo_decay=0)

    # Build an episode from `pi` on the gridworld; (2, 0) is presumably the
    # start state -- confirm against make_episode's signature.
    episode_obj = make_episode((2, 0), pi, gridworld, eps_greedy=None)
    # The bare string below is an expression statement with no effect. It
    # reads like a pasted parameter list -- TODO confirm what it documents.
    """environment, Nsteps=3, 
                 policy=None, episode_obj=None, 
                 terminal_set=None,
                 max_steps=sys.maxsize, eps_greedy=None"""
Пример #18
0
 def setUp(self):
     """Create the environment and the Policy under test before each test.

     The Policy is bound to the environment from get_gridworld() and then
     initialized through intialize_policy_to_equiprobable.
     """
     unittest.TestCase.setUp(self)
     world = get_gridworld()
     self.gridworld = world
     self.P = Policy(environment=world)
     self.P.intialize_policy_to_equiprobable(env=world)
Пример #19
0
# Configure the epsilon schedule for a run of NUM_EPISODES episodes
# (presumably so epsilon reaches epsilon_final by the end -- confirm).
eps_obj.set_half_life_for_N_episodes(
    Nepisodes=NUM_EPISODES,
    epsilon_final=0.16666666666666,
)

# Semi-gradient agent using the 'qlearn' update over the maze's linear features.
agent = SA_SemiGradAgent(
    environment=gridworld,
    update_type='qlearn',
    sa_linear_function=LazyProgrammerMaze(gridworld),
    learn_tracker=learn_tracker,
    gamma=0.9,
    alpha=alpha_obj,
    epsilon=eps_obj,
)

# Every training episode is started from (2, 0).
for _episode in range(NUM_EPISODES):
    agent.run_episode((2, 0))
print()

separator = '-' * 77

agent.summ_print()
print(separator)
#learn_tracker.summ_print()
#print('-'*77)

agent.action_value_linfunc.summ_print(fmt_Q='%.4f')
print(separator)

# Build a deterministic policy: for each action state, take the best action
# reported by the learned linear function with no epsilon-greedy object.
policy = Policy(environment=gridworld)
for s_hash in gridworld.iter_all_action_states():
    policy.set_sole_action(
        s_hash,
        agent.action_value_linfunc.get_best_eps_greedy_action(
            s_hash, epsgreedy_obj=None))

policy.summ_print(environment=gridworld, verbosity=0)
Пример #20
0
from introrl.policy import Policy
from introrl.black_box_sims.racetrack_1_sim import RaceTrack_1
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Racetrack simulation whose cells are drawn below.
RT = RaceTrack_1()
#pi = Policy( environment=RT )

# Read the 'racetrack_1_sim' pickle through a Policy constructed with
# default arguments.
# NOTE(review): presumably a dict, judging by the trailing D -- confirm.
policyD = Policy().read_pickle_file('racetrack_1_sim')

#pi.init_from_pickle_file( 'racetrack_1_sim' )

fig, ax = plt.subplots()

# One translucent blue 1x1 square per racetrack_area cell. The anchor is
# offset by 0.5 so the square is centered on the cell; `i` is used as the
# x coordinate and `j` as the y coordinate.
for (j, i) in RT.racetrack_area:
    rect = mpatches.Rectangle((i - .5, j - .5),
                              1.0,
                              1.0,
                              ec="none",
                              color='blue',
                              alpha=0.3)
    ax.add_patch(rect)

# One opaque yellow 1x1 square per starting_lineL entry; the last two fields
# of each 4-tuple are ignored.
# NOTE(review): no ax.add_patch(rect) is visible for this loop in the lines
# shown here -- confirm the patch is actually added to the axes.
for (j, i, _, _) in RT.starting_lineL:
    rect = mpatches.Rectangle((i - .5, j - .5),
                              1.0,
                              1.0,
                              ec="none",
                              color='yellow',
                              alpha=1.)
Пример #21
0
import matplotlib.pyplot as plt
from introrl.td_funcs.td0_prediction import td0_prediction
from introrl.utils.running_ave import RunningAve
from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.agent_supt.nstep_td_eval_walker import NStepTDWalker
from introrl.mdp_data.random_walk_generic_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode

# NOTE(review): presumably the discount factor handed to the prediction
# routines later in the script; it is not used in the lines visible here.
GAMMA=1.0

# NOTE(review): presumably the number of runs averaged over -- confirm
# against its use further down.
AVE_OVER = 100

# Random-walk MRP built with Nside_states=9, reward +1.0 for a win,
# -1.0 for a loss and 0.0 for each ordinary step.
rw_mrp = get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0)
policy = Policy( environment=rw_mrp )

policy.intialize_policy_to_equiprobable() # should be equiprobable from above init already

# Build an episode from `policy` on the random walk; 'C' is presumably the
# start state -- confirm against make_episode's signature.
episode_obj = make_episode( 'C', policy, rw_mrp )

fig, ax = plt.subplots()

# ---------------- set up true value data for RMS calc --------------------
# Seed the table of true values with state 'C' at 0.0.
true_valueD = {'C':0.0} # { 'Win':0.0, 'Lose':0.0}

#print('rw_mrp.get_num_states() = ',rw_mrp.get_num_states())
# delta = 2.0 / (number of states - 1).
# NOTE(review): presumably the true-value step between neighbouring states,
# given the -1.0 .. +1.0 reward span -- confirm.
delta = 2.0 / (rw_mrp.get_num_states()-1)
# Half the state count (truncated to int) minus one.
Nsides = int( rw_mrp.get_num_states() / 2) - 1
# Starts at 0.0; presumably advanced by the loop that follows.
d = 0.0
for i in range(1, Nsides+1 ):
Пример #22
0
from introrl.mdp_data.slippery_cleaning_robot import get_robot

# Environment produced by get_robot() from introrl.mdp_data.slippery_cleaning_robot.
gridworld = get_robot()

# Hard-coded toggle: the constant 1 always selects the value-iteration
# branch; the else branch runs only if this is edited to a falsy value.
if 1:
    # Value iteration with gamma=1.0; returns both a policy and state values.
    policy, state_value = dp_value_iteration( gridworld, do_summ_print=True,fmt_V='%.3f',
                                              max_iter=1000, err_delta=0.001, 
                                              gamma=1.0)
    
    print('_'*55)
    # Score the resulting policy with a step limit of 1000; the result is
    # printed as (r_sum, n_steps, msg).
    score = gridworld.get_policy_score( policy, start_state_hash=None, step_limit=1000)
    print('Policy Score =', score, ' = (r_sum, n_steps, msg)')

else:

    # Alternative path: evaluate the environment's default policy.
    pi = Policy( environment=gridworld )
    pi.set_policy_from_piD( gridworld.get_default_policy_desc_dict() )

    sv = StateValues( gridworld )
    sv.init_Vs_to_zero()

    # Note that gamma here is .985, not the 1.0 used by the branch above.
    dp_policy_evaluation( pi, sv, max_iter=1000, err_delta=0.001, gamma=.985)

    #sv.summ_print( fmt_V='%.3f', show_states=False )
    pi.summ_print(  environment=gridworld, verbosity=0, show_env_states=False  )



# Print whatever the environment's get_info() returns, regardless of branch.
print( gridworld.get_info() )