    def init_Qsa_to_zero(self):
        # initialize to 0.0 for all states, terminal and non-terminal.
        for s_hash in self.environment.iter_all_states():
            if s_hash not in self.Qsa_RaveD:
                self.Qsa_RaveD[s_hash] = {}

            # may not be any actions in terminal state, so set None action.
            if s_hash in self.environment.terminal_set:
                self.Qsa_RaveD[s_hash][None] = RunningAve(name=str(s_hash) +
                                                          ' None')

            aL = self.environment.get_state_legal_action_list(s_hash)
            for a_desc in aL:
                self.Qsa_RaveD[s_hash][a_desc] = RunningAve(name=str(s_hash) +
                                                            ' ' + str(a_desc))
Example #2
    def save_action_results(self, a_desc, sn_hash, reward_val, 
                            force_deterministic=False):
        """
        Add sn_hash to possible next states and add to its RunningAve
        If force_deterministic is True, force the new sn_hash to be unique
        """
        
        # make sure that a_desc is initialized
        if a_desc not in self.action_countD:
            self.add_action( a_desc )
        
        # increment action counters
        self.action_countD[ a_desc ] += 1 # inc. count of a_desc calls
        self.total_action_calls += 1

        # make sure sn_hash dict is initialized for a_desc
        if a_desc not in self.action_sn_rD:
            self.action_sn_rD[ a_desc ] = {} 
            # snD... index=sn_hash: value=RunningAve of Reward
        
        # save sn_hash and update reward running average for (a_desc, sn_hash)
        if sn_hash not in self.action_sn_rD[ a_desc ]:
            self.action_sn_rD[ a_desc ][ sn_hash ] = \
                RunningAve( name= 'Reward (%s, %s, %s)'%(str(self.s_hash), str(a_desc), str(sn_hash)) )
                    
        # update the RunningAve of (a_desc, sn_hash) with current reward_val
        self.action_sn_rD[ a_desc ][sn_hash].add_val( reward_val )
        
        if force_deterministic and (len(self.action_sn_rD[ a_desc ])>1):
            # remove any sn_hash other than the current input sn_hash
            D = {sn_hash: self.action_sn_rD[ a_desc ][sn_hash]}
            self.action_sn_rD[ a_desc ] = D
            self.action_sn_rD[ a_desc ][sn_hash].set_all_attrib( 1, reward_val, reward_val, reward_val)
Example #3
def get_td0_data():
    if 'TD0_raveL' in dataD:
        TD0_raveL = dataD['TD0_raveL']
        Nruns = TD0_raveL[0].num_val
        print(Nruns, ' of TD0_raveL found')
    else:
        TD0_raveL = []
        Nruns = 0

    for loop in range(Nruns, RUN_COUNT):

        learn_tracker.clear()
        policy, state_value = \
            td0_epsilon_greedy( CW,   learn_tracker=learn_tracker,
                                  initial_Vs=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                                  use_list_of_start_states=False, # use list OR single start state of environment.
                                  do_summ_print=False, show_last_change=False, fmt_V='%g', fmt_R='%g',
                                  pcent_progress_print=0,
                                  show_banner = False,
                                  max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                                  gamma=1.0,
                                  max_episode_steps=1000,
                                  epsilon=EPSILON,
                                  alpha=ALPHA)
        reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()

        while len(reward_sum_per_episodeL) > len(TD0_raveL):
            TD0_raveL.append(RunningAve())
        for R, r in zip(TD0_raveL, reward_sum_per_episodeL):
            R.add_val(r)
    dataD['TD0_raveL'] = TD0_raveL
    save_to_pickle()
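A plausible follow-up (not part of the snippet above; the curve name and plotting choices are assumptions) is to collapse the per-episode RunningAve list into an averaged learning curve once enough runs have accumulated:

import matplotlib.pyplot as plt

# average reward per episode across all runs accumulated in TD0_raveL
td0_curve = [R.get_ave() for R in dataD['TD0_raveL']]

plt.plot(td0_curve, label='TD(0) epsilon-greedy')
plt.xlabel('Episode')
plt.ylabel('Average sum of rewards per episode')
plt.legend()
plt.show()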
Example #4
    def get_estimated_rewards(self):
        """
        Return a dictionary of estimated rewards for each state.
        AND a dictionary of any special message
        (Will be exact for deterministic environment)
        """
        est_rD = {}  # index=s_hash, value=float reward estimate.
        msgD = {}  # index=s_hash, value=any special message

        # initialize all rewards to zero for all states.
        for S in self.SC.iter_states():
            est_rD[S.hash] = RunningAve(S.hash)

        for s_hash, a_desc, T in self.TC.iter_all_transitions():
            for sn_hash, t_prob, reward in T.iter_sn_hash_prob_reward():
                Robj = T.get_reward_obj(sn_hash)

                if Robj.reward_type == CONST:
                    est_rD[sn_hash].add_val(reward)

                else:
                    msgD[sn_hash] = 'est'
                    # if the reward is stochastic, average 100 values
                    for i in range(100):
                        est_rD[sn_hash].add_val(Robj())

        # Need to convert RunningAve objects to float
        for (s_hash, RA) in est_rD.items():
            est_rD[s_hash] = RA.get_ave()
            #print(s_hash, RA)

        return est_rD, msgD
Example #5
def get_expected_sarsa_data():
    if 'ExpSarsa_raveD' in dataD:
        ExpSarsa_raveD = dataD['ExpSarsa_raveD']
        ave_run_time = dataD['ExpSarsa_ave_run_time']
    else:
        ExpSarsa_raveD = {}
        ave_run_time = RunningAve()
        for alpha in ALPHA_LIST:
            ExpSarsa_raveD[alpha] = [RunningAve(), RunningAve()]

    Nruns = ExpSarsa_raveD[0.1][0].num_val
    print(Nruns, ' of ExpSarsa_raveD found')

    for loop in range(Nruns, RUN_COUNT):
        for alpha in ALPHA_LIST:

            start_time = time.time()
            learn_tracker.clear()
            policy, state_value = \
                expected_sarsa_eps_greedy( CW,  learn_tracker=learn_tracker,
                                      initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                                      use_list_of_start_states=False, # use list OR single start state of environment.
                                      do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                                      pcent_progress_print=0,
                                      show_banner = False,
                                      max_num_episodes=1000, min_num_episodes=1000, max_abserr=0.000001,
                                      gamma=1.0,
                                      max_episode_steps=10000,
                                      epsilon=EPSILON,
                                      alpha=alpha)

            reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()

            ave_run_time.add_val(time.time() -
                                 start_time)  # compute average run time
            ExpSarsa_raveD[alpha][0].add_val(
                sum(reward_sum_per_episodeL[:100]) / 100.0)
            ExpSarsa_raveD[alpha][1].add_val(
                sum(reward_sum_per_episodeL) / 1000.0)

        print('.', end='')
    print('ExpSarsa_ave_run_time = ', ave_run_time.get_ave())

    dataD['ExpSarsa_raveD'] = ExpSarsa_raveD
    dataD['ExpSarsa_ave_run_time'] = ave_run_time
    save_to_pickle('ExpSarsa_raveD', 'ExpSarsa_ave_run_time')
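The structure built above stores, per alpha, a RunningAve of the mean reward over the first 100 episodes (index 0) and over all 1000 episodes (index 1). A hypothetical read-out (the loop and print format are assumptions, not from the snippet) could look like:

for alpha in ALPHA_LIST:
    first100_rave, all1000_rave = ExpSarsa_raveD[alpha]
    print('alpha=%g  first-100-episode ave=%.2f  1000-episode ave=%.2f'
          % (alpha, first100_rave.get_ave(), all1000_rave.get_ave()))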
Example #6
#print('true_valueD =',true_valueD)
#sys.exit()

# ----------------------------------------- generate data -------------
alphaL = [0.05*n for n in range(21)]
nstepL = [1,2,4,8, 16, 32]

nstep_walkerL = []
ave_rms_aveD = {} # index=(alpha, Nsteps), value=RunningAve
sv_collD = {} # index=(alpha, Nsteps), value=StateValueColl

# create data structures
for Nsteps in nstepL:
    nstep_walkerL.append( NStepTDWalker(rw_mrp, Nsteps=Nsteps,  episode_obj=episode_obj) )
    for alpha in alphaL:
        ave_rms_aveD[ (alpha, Nsteps) ] = RunningAve()
        sv_collD[ (alpha, Nsteps) ] = StateValueColl( rw_mrp, init_val=0.0 )

# begin main loop over runs
for loop in range(AVE_OVER): # average rms curves over AVE_OVER runs
    if loop%10==0:
        print(loop, end='')
    else:
        print('.', end='')
    
    # set state variables to 0.0
    for Nsteps in  nstepL :
        for  alpha in alphaL:
            sv_collD[ (alpha, Nsteps) ].init_Vs_to_val( 0.0 )

            # get the initial RMS 
        expected_sarsa_eps_greedy( CW, learn_tracker=learn_tracker,
                              initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                              show_banner = False,
                              pcent_progress_print=0,
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                              gamma=1.0,
                              max_episode_steps=1000,
                              epsilon=EPSILON,
                              alpha=ALPHA)

    reward_sum_per_episodeL_es = learn_tracker.reward_sum_per_episode()

    while len(reward_sum_per_episodeL_es) > len(ExpSarsa_raveL):
        ExpSarsa_raveL.append(RunningAve())
    for R, r in zip(ExpSarsa_raveL, reward_sum_per_episodeL_es):
        R.add_val(r)

    learn_tracker.clear()
    policy_t, state_value_t = \
        td0_epsilon_greedy( CW,  learn_tracker=learn_tracker,
                              initial_Vs=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_V='%g', fmt_R='%g',
                              show_banner = False,
                              pcent_progress_print=0,
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                              gamma=1.0,
                              max_episode_steps=1000,
                              epsilon=EPSILON,
                              alpha=ALPHA)
Example #8
import matplotlib
import matplotlib.pyplot as plt

from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.mdp_data.random_walk_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode
from introrl.utils.running_ave import RunningAve

rw_mrp = get_random_walk()
policy = Policy( environment=rw_mrp )

NumEpisodes = 100
mc_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]
td_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]

alpha_td = 0.1
alpha_mc = 0.02
gamma = 1.0

true_valueD = {'A':1.0/6.0, 'B':2.0/6.0, 'C':3.0/6.0, 'D':4.0/6.0, 'E':5.0/6.0}

for o_loop in range(1,101):
    print('%2i'%o_loop, end=' ')
    if o_loop % 20 == 0:
        print()
            
    # make 2 state value objects.
    sv_td = StateValueColl( rw_mrp, init_val=0.5 )
    sv_mc = StateValueColl( rw_mrp, init_val=0.5 )
    def init_Vs_to_zero(self):
        # initialize to 0.0 for all states, terminal and non-terminal.
        for s_hash in self.environment.iter_all_states():
            self.Vs_RaveD[s_hash] = RunningAve(name=s_hash)
        sarsa_epsilon_greedy( CW,  learn_tracker=learn_tracker,
                              initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                              pcent_progress_print=0,
                              show_banner = False,
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001, 
                              gamma=1.0,
                              max_episode_steps=1000,
                              epsilon=EPSILON, 
                              alpha=ALPHA)
                              
    reward_sum_per_episodeL_s = learn_tracker.reward_sum_per_episode()

    while len(reward_sum_per_episodeL_s) > len(Sarsa_raveL):
        Sarsa_raveL.append( RunningAve() )
    for R,r in zip(Sarsa_raveL,  reward_sum_per_episodeL_s):
        R.add_val( r )
    
    
    learn_tracker.clear()
    policy_q, state_value_q = \
        qlearning_epsilon_greedy( CW,  learn_tracker=learn_tracker,
                              initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                              pcent_progress_print=0,
                              show_banner = False,
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001, 
                              gamma=1.0,
                              max_episode_steps=1000,
                              epsilon=EPSILON,
                              alpha=ALPHA)
    maze_q.open_gate_R()
    maze_q.close_gate_L()

    # DynaQ+ episodes
    while agent_qp.model.total_action_calls < 3000:
        if agent_qp.model.total_action_calls >= 1000:
            maze_q.open_gate_L()
            maze_q.close_gate_R()

        agent_qp.run_episode('Start', Nplanning_loops=PLAN_LOOPS)

    cum_rew_qL = learn_tracker_q.cum_reward_per_step()
    cum_rew_qpL = learn_tracker_qp.cum_reward_per_step()

    while len(q_raveL) < min(3000, len(cum_rew_qL)):
        q_raveL.append(RunningAve())
    for i, r in enumerate(cum_rew_qL):
        if i < 3000:
            q_raveL[i].add_val(r)

    while len(qp_raveL) < min(3000, len(cum_rew_qpL)):
        qp_raveL.append(RunningAve())
    for i, r in enumerate(cum_rew_qpL):
        if i < 3000:
            qp_raveL[i].add_val(r)

#agent_q.model.summ_print(long=True)
#sys.exit()
agent_q.action_value_coll.summ_print(fmt_Q='%.3f',
                                     none_str='*',
                                     show_states=True,
import sys
import matplotlib
import matplotlib.pyplot as plt

from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.mdp_data.random_walk_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode
from introrl.utils.running_ave import RunningAve

rw_mrp = get_random_walk()
policy = Policy( environment=rw_mrp )

NumEpisodes = 100
mc_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]
td_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]

alpha = 0.1
gamma = 1.0

true_valueD = {'A':1.0/6.0, 'B':2.0/6.0, 'C':3.0/6.0, 'D':4.0/6.0, 'E':5.0/6.0}
    
def calc_td_error(show_values=True):
    errD = {} # index=s_hash, value=Vtarget - V(s)
    for s_hash in ['A','B','C','D','E']:
        errD[s_hash] = 0.0
        
    for (s_hash,sn_hash), R in td_averD.items():
        errD[s_hash] += R.get_ave() + gamma*sv_td.get_Vs(sn_hash) - sv_td.get_Vs(s_hash)
    
Example #13
fig, ax = plt.subplots()

# ---------------- set up true value data for RMS calc --------------------
true_valueD = {'C': 0.0, 'Win': 0.0, 'Lose': 0.0}

delta = 2.0 / (rw_mrp.get_num_states() + 1)
Nsides = int(rw_mrp.get_num_states() / 2) - 1
d = 0.0
for i in range(1, Nsides + 1):
    d += delta
    true_valueD['L-%i' % i] = -d
    true_valueD['R+%i' % i] = d

# ----------------------------------------- generate TD(0) data -------------
alphaL = [0.01] + [0.05 * n for n in range(1, 21)]
ave_rms_aveL = [RunningAve(name='alpha=%g' % alpha) for alpha in alphaL]

for ialpha, alpha in enumerate(alphaL):

    for loop in range(100):  # average rms curves over 100 runs
        sv = StateValueColl(rw_mrp, init_val=0.5)

        resultL, value_snapD = td0_prediction(
            policy,
            sv,
            all_start_states=False,
            do_summ_print=False,
            show_last_change=False,
            show_banner=False,
            pcent_progress_print=0,
            alpha=alpha,