class MyTest(unittest.TestCase):

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.gridworld = get_gridworld()
        self.P = Policy(environment=self.gridworld)
        self.P.intialize_policy_to_equiprobable(env=self.gridworld)

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        del self.P

    def test_should_always_pass_cleanly(self):
        """Should always pass cleanly."""
        pass

    def test_myclass_existence(self):
        """Check that myclass exists"""
        # See if the self.P object exists
        self.assertIsInstance(self.P, Policy, msg=None)

    def test_set_policy_from_default_pi(self):
        """test set policy from default pi"""
        policyD = self.gridworld.get_default_policy_desc_dict()
        self.P.set_policy_from_piD(policyD)

        self.assertEqual(self.P.get_action_prob((2, 2), 'U'), 1.0)
        self.assertEqual(self.P.get_action_prob((2, 2), 'R'), 0.0)
        self.assertEqual(self.P.get_action_prob((2, 2), 'D'), None)

    #def test_set_policy_from_list_of_actions(self):
    #    """test set policy from list of actions"""
    #    piD = {(0, 0): ('R', 'D')}
    #    self.P.set_policy_from_piD(piD)
    #    self.assertEqual(self.P.get_action_prob((0, 0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob((0, 0), 'R'), 0.5)
    #    self.assertEqual(self.P.get_action_prob((0, 0), 'D'), 0.5)

    #def test_set_policy_from_list_of_action_probs(self):
    #    """test set policy from list of action probs"""
    #    piD = {(0, 0): [('R', 0.6), ('D', 0.4)]}
    #    self.P.set_policy_from_piD(piD)
    #    self.assertEqual(self.P.get_action_prob((0, 0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob((0, 0), 'R'), 0.6)
    #    self.assertEqual(self.P.get_action_prob((0, 0), 'D'), 0.4)
    #
    #    # make (action, prob) entry too long.
    #    with self.assertRaises(ValueError):
    #        piD = {(0, 0): [('R', 0.6, 0.4), ('D', 0.4, 0.6)]}
    #        self.P.set_policy_from_piD(piD)

    def test_learn_all_s_and_a(self):
        """test learn all s and a"""
        self.P.learn_all_states_and_actions_from_env(self.gridworld)

    def test_initialize_to_random(self):
        """test initialize to random"""
        self.P.intialize_policy_to_random(env=self.gridworld)
        apL = self.P.get_list_of_all_action_desc_prob((0, 2), incl_zero_prob=True)
        pL = [p for (adesc, p) in apL]
        self.assertEqual(sorted(pL), [0.0, 0.0, 1.0])

    def test_iterate_adesc_p(self):
        """test iterate adesc p"""
        apL = []
        for (a_desc, p) in self.P.iter_policy_ap_for_state((0, 0), incl_zero_prob=False):
            apL.append((a_desc, p))

        self.assertIn(('R', 0.5), apL)
        self.assertIn(('D', 0.5), apL)
        self.assertNotIn(('U', 0.5), apL)

    def test_iterate_all_states(self):
        """test iterate all states"""
        sL = []
        for s_hash in self.P.iter_all_policy_states():
            sL.append(s_hash)
        sL.sort()

        self.assertEqual(len(sL), 9)
        self.assertEqual(sL[0], (0, 0))
        self.assertEqual(sL[-1], (2, 3))

    def test_get_single_action(self):
        """test get single action"""
        a_desc = self.P.get_single_action((0, 0))
        self.assertIn(a_desc, ('R', 'D'))

        a_desc = self.P.get_single_action((99, 99))
        self.assertEqual(a_desc, None)
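
# A minimal, hedged addition (not confirmed by the source file): a standard
# unittest entry point so this test class can be run directly with
# "python <this_file>.py". If the full module already defines one, it is redundant.
if __name__ == '__main__':
    unittest.main()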
if do_VI:
    policy, sv = dp_value_iteration(robot, do_summ_print=False, fmt_V='%.1f',
                                    max_iter=1000, err_delta=0.001, gamma=gamma)
else:
    policy = Policy(environment=robot)
    policy.set_policy_from_piD(robot.get_default_policy_desc_dict())

    sv = StateValues(robot)
    sv.init_Vs_to_zero()

    dp_policy_iteration(policy, sv, do_summ_print=False,
                        max_iter=1000, err_delta=0.001, gamma=gamma)

print('gamma=%5g' % gamma,
      ' Fallen=', policy.get_single_action('Fallen'),
      ' Moving=', policy.get_single_action('Moving'),
      ' Standing=', policy.get_single_action('Standing'),
      ' Fallen=', '%g' % sv.VsD['Fallen'],
      ' Moving=', '%g' % sv.VsD['Moving'],
      ' Standing=', '%g' % sv.VsD['Standing'])

print(robot.get_info())
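
# Hedged helper sketch (an illustration, not part of the original script): it
# collects the converged greedy action and state value for each named robot
# state into a dict, using only the calls the fragment above already makes
# (policy.get_single_action and the sv.VsD lookup).
def summarize_robot_solution(policy, sv, state_names=('Fallen', 'Moving', 'Standing')):
    """Return {state: (greedy_action, state_value)} for the given state names."""
    return {s_hash: (policy.get_single_action(s_hash), sv.VsD[s_hash])
            for s_hash in state_names}

# e.g. print(summarize_robot_solution(policy, sv))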
def mc_epsilon_greedy(environment,
                      initial_policy='default',  # can be 'default', 'random', policy_dictionary
                      read_pickle_file='',
                      save_pickle_file='',
                      use_list_of_start_states=True,  # use list OR single start state of environment.
                      iter_all_start_actions=False,   # pick random or iterate all starting actions
                      first_visit=True,
                      do_summ_print=True, showRunningAve=False, fmt_Q='%g', fmt_R='%g',
                      show_initial_policy=True,
                      max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001,
                      gamma=0.9,
                      iteration_prints=0,
                      max_episode_steps=10000,
                      epsilon=0.1, const_epsilon=True, half_life=200,
                      N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Monte Carlo Epsilon-Greedy iteration to find the OPTIMAL POLICY

    Returns: Policy and ActionValueRunAveColl objects

    Use Episode Discounted Returns to find Q(s,a), Action-Value Function

    Terminates when abserr < max_abserr

    Q(s,a), action_value_ave, is created and initialized to zero inside this call.

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.

    CREATES BOTH policy AND action_value OBJECTS.
    """
    eps_greedy = EpsilonGreedy(epsilon=epsilon, const_epsilon=const_epsilon,
                               half_life=half_life,
                               N_episodes_wo_decay=N_episodes_wo_decay)

    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)

    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_epsilon_greedy')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_epsilon_greedy')
        policy.intialize_policy_to_random(env=environment)
    else:
        print('Initializing Policy to "custom policy" in mc_epsilon_greedy')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    action_value_ave.init_Qsa_to_zero()  # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        if show_initial_policy:
            print('=============== STARTING WITH THE INITIAL POLICY ====================')
            policy.summ_print(verbosity=0, environment=environment,
                              show_env_states=False, none_str='*')

        print('================== EPSILON GREEDY DEFINED AS ========================')
        eps_greedy.summ_print()

    s = 'Starting a Maximum of %i Monte Carlo Epsilon Greedy\nfor "%s" with Gamma = %g'%\
        (max_num_episodes, environment.name, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True

    limited_start_stateL = environment.limited_start_state_list()

    progress_str = ''
    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria
        Nterminal_episodes = set()

        # Iterate over a list of known possible start states
        if use_list_of_start_states:
            loop_stateL = limited_start_stateL
            random.shuffle(loop_stateL)
        else:
            #loop_stateL = [ random.choice( limited_start_stateL ) ]
            loop_stateL = [environment.start_state_hash]

        for start_hash in loop_stateL:
            if iter_all_start_actions:
                # Iterate over ALL ACTIONS of start_hash
                a_descL = environment.get_state_legal_action_list(start_hash)
            else:
                a_desc = policy.get_single_action(start_hash)
                # if not iterating all actions, make sure first action has eps-greedy applied
                a_desc = eps_greedy(a_desc, environment.get_state_legal_action_list(start_hash))
                a_descL = [a_desc]

            # randomize action order
            random.shuffle(a_descL)

            for a_desc in a_descL:
                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash, policy, environment,
                             environment.terminal_set,
                             episode=episode,
                             first_a_desc=a_desc,
                             max_steps=max_episode_steps,
                             eps_greedy=eps_greedy)

                eps_greedy.inc_N_episodes()
                num_episodes += 1

                if episode.is_done():
                    Nterminal_episodes.add(start_hash)

                for dr in episode.get_rev_discounted_returns(gamma=gamma,
                                                             first_visit=first_visit,
                                                             visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave(s, a)
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [a]
                            elif q == best_a_val:
                                bestL.append(a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy, start_state_hash=None,
                                                 step_limit=1000)
            print(out_str, ' score=%s' % str(score), ' = (r_sum, n_steps, msg)', end=' ')
            print('Nterminal episodes =', len(Nterminal_episodes), ' of ', len(loop_stateL))
            progress_str = out_str
    print()

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited Epsilon Greedy, MC First-Visit Value Iterations', s)
        print('   num episodes =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit =', max_abserr)
        print('Nterminal episodes =', len(Nterminal_episodes), ' of ', len(loop_stateL))

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward', fmt=fmt_R,
                                     show_env_states=False, none_str='*')
        except:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
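
# Hedged usage sketch (illustration only): one way mc_epsilon_greedy might be
# driven for a small grid-world.  "get_gridworld" is assumed to be the same
# environment factory used by the unit tests above; all keyword arguments shown
# are taken directly from the signature of mc_epsilon_greedy.
if __name__ == '__main__':
    gridworld = get_gridworld()   # assumed environment constructor
    policy, action_value_ave = mc_epsilon_greedy(gridworld,
                                                 initial_policy='random',
                                                 max_num_episodes=500,
                                                 gamma=0.9,
                                                 epsilon=0.1,
                                                 const_epsilon=True,
                                                 do_summ_print=True)
    print('Greedy action at (0, 0):', policy.get_single_action((0, 0)))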