Example #1
import unittest

# Policy and get_gridworld are provided by the package under test; import
# them from wherever they live in your project layout.


class MyTest(unittest.TestCase):
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.gridworld = get_gridworld()
        self.P = Policy(environment=self.gridworld)
        self.P.intialize_policy_to_equiprobable(env=self.gridworld)

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        del self.P

    def test_should_always_pass_cleanly(self):
        """Should always pass cleanly."""
        pass

    def test_myclass_existence(self):
        """Check that myclass exists"""

        # See if the self.P object exists
        self.assertIsInstance(self.P, Policy, msg=None)

    def test_set_policy_from_default_pi(self):
        """test set policy from default pi"""

        policyD = self.gridworld.get_default_policy_desc_dict()
        self.P.set_policy_from_piD(policyD)

        self.assertEqual(self.P.get_action_prob((2, 2), 'U'), 1.0)
        self.assertEqual(self.P.get_action_prob((2, 2), 'R'), 0.0)
        self.assertIsNone(self.P.get_action_prob((2, 2), 'D'))

    #def test_set_policy_from_list_of_actions(self):
    #    """test set policy from list of actions"""
    #    piD = {(0, 0):('R','D') }
    #    self.P.set_policy_from_piD( piD )

    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.5)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.5)

    #def test_set_policy_from_list_of_action_probs(self):
    #    """test set policy from list of action probs"""
    #    piD = {(0, 0):[('R',0.6), ('D',0.4)] }
    #    self.P.set_policy_from_piD( piD )

    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.6)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.4)

    #    # make (action, prob) entry too long.
    #    with self.assertRaises(ValueError):
    #        piD = {(0, 0):[('R',0.6,0.4), ('D',0.4,0.6)] }
    #        self.P.set_policy_from_piD( piD )

    def test_learn_all_s_and_a(self):
        """test learn all s and a"""

        # smoke test: learning every (state, action) pair from the env should not raise
        self.P.learn_all_states_and_actions_from_env(self.gridworld)

    def test_initialize_to_random(self):
        """test initialize to random"""

        self.P.intialize_policy_to_random(env=self.gridworld)
        apL = self.P.get_list_of_all_action_desc_prob((0, 2),
                                                      incl_zero_prob=True)
        pL = [p for (adesc, p) in apL]
        self.assertEqual(sorted(pL), [0.0, 0.0, 1.0])

    def test_iterate_adesc_p(self):
        """test iterate adesc p"""

        apL = []
        for (a_desc, p) in self.P.iter_policy_ap_for_state(
                (0, 0), incl_zero_prob=False):
            apL.append((a_desc, p))

        self.assertIn(('R', 0.5), apL)
        self.assertIn(('D', 0.5), apL)
        self.assertNotIn(('U', 0.5), apL)

    def test_iterate_all_states(self):
        """test iterate all states"""

        sL = []
        for s_hash in self.P.iter_all_policy_states():
            sL.append(s_hash)
        sL.sort()
        self.assertEqual(len(sL), 9)
        self.assertEqual(sL[0], (0, 0))
        self.assertEqual(sL[-1], (2, 3))

    def test_get_single_action(self):
        """test get single action"""
        a_desc = self.P.get_single_action((0, 0))
        self.assertIn(a_desc, ('R', 'D'))

        a_desc = self.P.get_single_action((99, 99))
        self.assertIsNone(a_desc)
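
If the test case above is saved as a stand-alone module, a standard unittest entry point (not part of the original listing) lets it run directly; invoking the file through python -m unittest works just as well without extra code.

if __name__ == '__main__':
    unittest.main()
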
Example #2
    # This fragment assumes that robot, gamma and do_VI are defined by the
    # enclosing code, and that Policy, StateValues, dp_value_iteration and
    # dp_policy_iteration are imported from the surrounding RL package.
    if do_VI:
        # value-iteration path: solve directly for the optimal policy and V(s)
        policy, sv = dp_value_iteration(robot,
                                        do_summ_print=False,
                                        fmt_V='%.1f',
                                        max_iter=1000,
                                        err_delta=0.001,
                                        gamma=gamma)
    else:
        # policy-iteration path: start from the default policy and zeroed V(s)
        policy = Policy(environment=robot)
        policy.set_policy_from_piD(robot.get_default_policy_desc_dict())

        sv = StateValues(robot)
        sv.init_Vs_to_zero()

        dp_policy_iteration(policy,
                            sv,
                            do_summ_print=False,
                            max_iter=1000,
                            err_delta=0.001,
                            gamma=gamma)

    # summarize the resulting policy and state values for this gamma
    print('gamma=%5g' % gamma,
          '  Fallen=', policy.get_single_action('Fallen'),
          '  Moving=', policy.get_single_action('Moving'),
          '  Standing=', policy.get_single_action('Standing'),
          '  Fallen=', '%g' % sv.VsD['Fallen'],
          '  Moving=', '%g' % sv.VsD['Moving'],
          '  Standing=', '%g' % sv.VsD['Standing'])

print(robot.get_info())
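
Because the if do_VI: block in the fragment is indented one level, it reads like the body of a sweep over discount factors. A minimal, hypothetical driver (the gamma values and the do_VI flag are illustrative only) could wrap it like this:

do_VI = True  # set False to exercise the policy-iteration branch instead
for gamma in (0.7, 0.9, 0.99):
    # ... the indented body shown in Example #2 goes here ...
    pass
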
Example #3
import random

# EpsilonGreedy, Policy, ActionValueRunAveColl, Episode, make_episode and
# banner are provided by the surrounding RL package; import them from
# wherever they live in your project layout.


def mc_epsilon_greedy(
        environment,
        initial_policy='default',  # can be 'default', 'random', policy_dictionary
        read_pickle_file='',
        save_pickle_file='',
        use_list_of_start_states=True,  # use list OR single start state of environment.
        iter_all_start_actions=False,  # pick random or iterate all starting actions
        first_visit=True,
        do_summ_print=True,
        showRunningAve=False,
        fmt_Q='%g',
        fmt_R='%g',
        show_initial_policy=True,
        max_num_episodes=1000,
        min_num_episodes=10,
        max_abserr=0.001,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=10000,
        epsilon=0.1,
        const_epsilon=True,
        half_life=200,
        N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ... 
    apply Monte Carlo Exploring Starts to find the OPTIMAL POLICY
    
    Returns: Policy and ActionValueRunAveColl objects
    
    Use Episode Discounted Returns to find Q(s,a), Action-Value Function
    
    Terminates when abserr < max_abserr
    
    Assume that Q(s,a), action_value_ave, has been initialized prior to call.
    
    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.
    
    CREATES BOTH policy AND action_value OBJECTS.
    """

    eps_greedy = EpsilonGreedy(epsilon=epsilon,
                               const_epsilon=const_epsilon,
                               half_life=half_life,
                               N_episodes_wo_decay=N_episodes_wo_decay)

    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)
    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_epsilon_greedy')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_epsilon_greedy')
        policy.intialize_policy_to_random(env=environment)
    else:
        print('Initializing Policy to "custom policy" in mc_epsilon_greedy')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    # Terminal states w/o an action are NOT included
    action_value_ave.init_Qsa_to_zero()
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        if show_initial_policy:
            print(
                '=============== STARTING WITH THE INITIAL POLICY ===================='
            )
            policy.summ_print(verbosity=0,
                              environment=environment,
                              show_env_states=False,
                              none_str='*')

        print(
            '================== EPSILON GREEDY DEFINED AS ========================'
        )
        eps_greedy.summ_print()

    s = 'Starting a Maximum of %i Monte Carlo Epsilon-Greedy Episodes\nfor "%s" with Gamma = %g'%\
        (max_num_episodes, environment.name, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True

    limited_start_stateL = environment.limited_start_state_list()

    progress_str = ''
    while (num_episodes < max_num_episodes) and keep_looping:

        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria
        Nterminal_episodes = set()

        # Iterate over a list of known possible start states
        if use_list_of_start_states:
            loop_stateL = limited_start_stateL
            random.shuffle(loop_stateL)
        else:
            #loop_stateL = [ random.choice( limited_start_stateL ) ]
            loop_stateL = [environment.start_state_hash]

        for start_hash in loop_stateL:

            if iter_all_start_actions:  # Iterate over ALL ACTIONS of start_hash
                a_descL = environment.get_state_legal_action_list(start_hash)
            else:
                a_desc = policy.get_single_action(start_hash)
                # if not iterating all actions, make sure first action has eps-greedy applied
                a_desc = eps_greedy(
                    a_desc,
                    environment.get_state_legal_action_list(start_hash))
                a_descL = [a_desc]
            # randomize action order
            random.shuffle(a_descL)

            for a_desc in a_descL:

                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash,
                             policy,
                             environment,
                             environment.terminal_set,
                             episode=episode,
                             first_a_desc=a_desc,
                             max_steps=max_episode_steps,
                             eps_greedy=eps_greedy)
                eps_greedy.inc_N_episodes()
                num_episodes += 1

                if episode.is_done():
                    Nterminal_episodes.add(start_hash)

                for dr in episode.get_rev_discounted_returns(
                        gamma=gamma, first_visit=first_visit, visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    # greedy policy improvement: make the highest-Q action
                    # (ties broken at random) the sole action for state s
                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave(s, a)
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [a]
                            elif q == best_a_val:
                                bestL.append(a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy,
                                                 start_state_hash=None,
                                                 step_limit=1000)
            print(out_str,
                  ' score=%s' % str(score),
                  ' = (r_sum, n_steps, msg)',
                  end=' ')
            print('Nterminal episodes =', len(Nterminal_episodes), ' of ',
                  len(loop_stateL))
            progress_str = out_str
    print()
    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited Epsilon Greedy, MC First-Visit Value Iterations', s)
        print('   num episodes    =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma           =', gamma)
        print('   estimated err   =', abserr)
        print('   Error limit     =', max_abserr)
        print('Nterminal episodes =', len(Nterminal_episodes), ' of ',
              len(loop_stateL))

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment,
                          verbosity=0,
                          show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward',
                                     fmt=fmt_R,
                                     show_env_states=False,
                                     none_str='*')
        except Exception:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
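
A minimal call sketch, reusing the get_gridworld() environment from Example #1 (the keyword values below are illustrative, not tuned settings):

env = get_gridworld()
policy, action_value_ave = mc_epsilon_greedy(
    env,
    initial_policy='default',
    max_num_episodes=5000,
    gamma=0.9,
    epsilon=0.1,
    const_epsilon=True,
    do_summ_print=True)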