Example #1
    def test_initialized_state_action_values(self):

        current_state = 's1'
        available_actions = {'a1', 'a2'}
        policy = QTablePolicy()

        policy.initialize_state(current_state, available_actions)
        self.assertEqual(0.0, policy.value_for(current_state, 'a1'))
        self.assertEqual(0.0, policy.value_for(current_state, 'a2'))
Example #2
    def test_qlearning_updates_both_policies(self):

        target_policy = QTablePolicy()
        behavior_policy = QTablePolicy()
        control = QLearningControl(target_policy, behavior_policy, alpha=0.2)

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.learn_episode(mdp, step_limit=3)

        self.assertEqual(target_policy._q_table, behavior_policy._q_table)
Example #3
    def test_value_for(self):
        planner_policy = MagicMock()
        qtable_policy = QTablePolicy()
        random_policy = RandomPolicy()
        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        # Evaluation of a state-action pair should be the same as for the qtable policy.
        policy.initialize_state('s1', {'a1', 'a2'})
        policy.update('s1', 'a2', -1.23)
        self.assertEqual(-1.23, policy.value_for('s1', 'a2'))
Example #4
    def test_initialized_random_action_suggestion(self):

        current_state = 's1'
        available_actions = {'a1', 'a2'}
        policy = QTablePolicy()

        policy.initialize_state(current_state, available_actions)

        # For a freshly initialized state all action values are equal, so a
        # random choice among the available actions is returned.
        # We patch `random.choice` with a mock so the test is deterministic.
        mocked_random_choice = MagicMock(return_value='a2')
        with patch('random.choice', mocked_random_choice):

            a0 = policy.suggest_action_for_state(current_state)

            # Result is determined by the mocked "random" choice
            self.assertEqual('a2', a0)
            self.assertTrue(a0 in available_actions)

            # Arguments of the mocked "random" choice should be all available actions
            mocked_random_choice.assert_called_with(list(available_actions))
Example #5
    def test_generate_episode_with_target_policy(self):

        target_policy = GuidedPolicy(
            ['move(right)', 'move(left)', 'vacuum', 'move(right)', 'vacuum'])
        behavior_policy = QTablePolicy()

        control = OffPolicyControl(target_policy, behavior_policy)

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.generate_episode_with_target_policy(mdp)
        self.assertEqual([None, -1, -1, -1, -1, 99], mdp.reward_history)
        self.assertEqual(95, mdp.return_history[0])
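
A quick check of the asserted return: assuming undiscounted returns, the return recorded for the initial state is simply the sum of all subsequent rewards (the leading None marks that no reward precedes the initial state):

    rewards = [-1, -1, -1, -1, 99]   # mdp.reward_history without the leading None
    assert sum(rewards) == 95        # matches mdp.return_history[0]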
Example #6
    def test_update(self):

        planner_policy = MagicMock()
        qtable_policy = MagicMock(spec=QTablePolicy())
        random_policy = MagicMock(spec=RandomPolicy())
        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        # Updating the policy should update the qtable policy as well.
        policy.initialize_state('s1', {'a1', 'a2'})
        policy.update('s1', 'a2', -1.23)
        qtable_policy.update.assert_called_with('s1', 'a2', -1.23)
Example #7
    def test_new_state(self):

        policy = QTablePolicy()

        self.assertTrue(policy.is_new_state(state='s1'))

        policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
        self.assertFalse(policy.is_new_state('s1'))
Example #8
    def test_optimal_value_for(self):

        planner_policy = MagicMock()
        qtable_policy = QTablePolicy()
        random_policy = RandomPolicy()
        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        # Evaluation of a state-action pair should be the same as for the qtable policy.
        policy.initialize_state('s', {'a', 'b', 'c'})
        policy.update('s', 'a', 1.23)
        policy.update('s', 'b', -5.43)
        policy.update('s', 'c', 0.03)
        self.assertEqual(1.23, policy.optimal_value_for('s'))
Example #9
    def test_episode_time_limit(self):

        # If a time limit is set, the control algorithm should stop after reaching that limit.

        target_policy = QTablePolicy()
        behavior_policy = GuidedPolicy(['move(right)', 'move(left)'] * 100)
        control = OffPolicyControl(target_policy, behavior_policy)
        control.policy_update_after_step = MagicMock()

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        mdp.transition = MagicMock(return_value=('some_next_state', -1))
        control.learn_episode(mdp, step_limit=3)

        self.assertEqual(3, len(control.policy_update_after_step.mock_calls))
        self.assertEqual(3, len(mdp.transition.mock_calls))
Example #10
    def test_is_new_state(self):

        qtable_policy = QTablePolicy()
        random_policy = RandomPolicy()

        mdp_builder = VacuumCleanerWorldBuilder()
        mdp = mdp_builder.build_mdp()
        planner_policy = PlannerPolicy(planning_horizon=1,
                                       mdp_builder=mdp_builder)

        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        self.assertTrue(policy.is_new_state(state='s1'))
        policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
        self.assertFalse(policy.is_new_state('s1'))
Example #11
    def test_terminal_state(self):

        policy = QTablePolicy()
        # s0 is terminal because it has no available actions.
        policy.initialize_state('s0', set())

        action = policy.suggest_action_for_state('s0')
        self.assertIsNone(action)

        value = policy.value_for('s0', None)
        # Terminal states always have a value of zero.
        self.assertEqual(0, value)
Example #12
    def test_monte_carlo_sgd_control_1(self):

        target_policy = QTablePolicy()
        behavior_policy = GuidedPolicy(['vacuum', 'move(right)', 'vacuum'])
        control = MonteCarloSGDControl(target_policy, alpha=0.4)

        # Even though this Monte-Carlo control is on-policy, we "cheat" for testing purposes
        # and give it a separate behavior policy.
        control.behavior_policy = behavior_policy

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.learn_episode(mdp)

        s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(0.4 * 97, target_policy.value_for(s0, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))

        s1 = frozenset({'robot(left)', 'dirty(right)'})
        self.assertEqual(0.4 * 98, target_policy.value_for(s1, 'move(right)'))

        s2 = frozenset({'robot(right)', 'dirty(right)'})
        self.assertEqual(0.4 * 99, target_policy.value_for(s2, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
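
The expected values above are consistent with a single Monte-Carlo SGD step per visited state-action pair, assuming an update of the form Q(s, a) <- Q(s, a) + alpha * (G_t - Q(s, a)) with Q initialized to 0, which leaves alpha * G_t after one episode (a minimal sketch, not necessarily the library's exact implementation):

    alpha, rewards = 0.4, [-1, -1, 99]   # rewards along vacuum, move(right), vacuum
    g2 = rewards[2]                      # return from s2: 99
    g1 = rewards[1] + g2                 # return from s1: 98
    g0 = rewards[0] + g1                 # return from s0: 97
    assert [alpha * g for g in (g0, g1, g2)] == [0.4 * 97, 0.4 * 98, 0.4 * 99]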
Example #13
    def test_qlearning_reversed_update_control_1(self):

        target_policy = QTablePolicy()
        behavior_policy = GuidedPolicy(['vacuum', 'move(right)', 'vacuum'] * 2)
        control = QLearningReversedUpdateControl(target_policy,
                                                 behavior_policy,
                                                 alpha=0.2)

        mdp_builder = VacuumCleanerWorldBuilder()
        mdp = mdp_builder.build_mdp()
        control.learn_episode(mdp)

        lr = 0.2
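        # The expected values below chain backwards: because updates are applied
        # in reverse temporal order, s2's value is already updated when s1's
        # target is computed, so the goal reward propagates through the whole
        # episode in a single pass.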
        s2_val = lr * 99
        s1_val = lr * (-1 + s2_val)
        s0_val = lr * (-1 + s1_val)

        s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(s0_val, target_policy.value_for(s0, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))

        s1 = frozenset({'robot(left)', 'dirty(right)'})
        self.assertEqual(s1_val, target_policy.value_for(s1, 'move(right)'))

        s2 = frozenset({'robot(right)', 'dirty(right)'})
        self.assertEqual(s2_val, target_policy.value_for(s2, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))

        # After being guided to the goal once, the target policy should know the way there.
        test_mdp = mdp_builder.build_mdp()
        self.assertSetEqual(s0, test_mdp.state)
        self.assertEqual(
            'vacuum', target_policy.suggest_action_for_state(test_mdp.state))
        test_mdp.transition('vacuum')
        self.assertEqual(
            'move(right)',
            target_policy.suggest_action_for_state(test_mdp.state))
        test_mdp.transition('move(right)')
        self.assertEqual(
            'vacuum', target_policy.suggest_action_for_state(test_mdp.state))
        test_mdp.transition('vacuum')
        self.assertIsNone(target_policy.suggest_action_for_state(test_mdp.state))

        # Repeat with a second mdp and exactly the same actions.
        # The value estimates should propagate.

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.learn_episode(mdp)

        self.assertSetEqual(frozenset({'robot(right)'}), mdp.state)

        s2_val += lr * (99 + 0 - s2_val)
        s1_val += lr * (-1 + s2_val - s1_val)
        s0_val += lr * (-1 + s1_val - s0_val)

        s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(s0_val, target_policy.value_for(s0, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))

        s1 = frozenset({'robot(left)', 'dirty(right)'})
        self.assertEqual(s1_val, target_policy.value_for(s1, 'move(right)'))

        s2 = frozenset({'robot(right)', 'dirty(right)'})
        self.assertEqual(s2_val, target_policy.value_for(s2, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
Example #14
    def test_tie_breaking(self):

        policy = QTablePolicy()
        policy.initialize_state('s0', ['a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
        policy.update(state='s0', action='a1', delta=5.2)
        policy.update(state='s0', action='a2', delta=3.2)
        policy.update(state='s0', action='a3', delta=5.2)
        policy.update(state='s0', action='a4', delta=-6.9)
        policy.update(state='s0', action='a6', delta=5.2)

        # Multiple actions have the same state-value-estimate:
        self.assertEqual(5.2, policy.value_for('s0', 'a1'))
        self.assertEqual(5.2, policy.value_for('s0', 'a3'))
        self.assertEqual(5.2, policy.value_for('s0', 'a6'))

        # When choosing the greedy action, this tie will be broken randomly.
        # We test this by creating a mock `random.choice` and check if the
        # random choice is made between the correct options {a1, a3, a6}.
        mocked_random_choice = MagicMock(return_value='a6')
        with patch('random.choice', mocked_random_choice):

            a0 = policy.suggest_action_for_state('s0')

            # Result is determined by the mocked "random" choice
            self.assertEqual('a6', a0)

            # The random choice should only happen between optimal actions.
            mocked_random_choice.assert_called_with(['a1', 'a3', 'a6'])
Example #15
    def test_update(self):

        current_state = 's0'
        available_actions = {'a1', 'a2'}

        policy = QTablePolicy()
        policy.initialize_state(current_state, available_actions)

        # An update of 5 for (s0, a2) should make a2 the greedy choice.
        # TODO: Clarify whether `delta` is an abstract update or a fixed quantity like a reward.
        policy.update(state='s0', action='a2', delta=5.0)
        self.assertEqual(0.0, policy.value_for('s0', 'a1'))
        self.assertEqual(5.0, policy.value_for('s0', 'a2'))
        self.assertEqual('a2', policy.suggest_action_for_state('s0'))

        # An update of 4 for (s0, a1) should keep a2 as the greedy choice.
        policy.update(state='s0', action='a1', delta=4.0)
        self.assertEqual(4.0, policy.value_for('s0', 'a1'))
        self.assertEqual(5.0, policy.value_for('s0', 'a2'))
        self.assertEqual('a2', policy.suggest_action_for_state('s0'))

        # An update of -2.6 for (s0, a2) should make a1 the greedy choice.
        policy.update(state='s0', action='a2', delta=-2.6)
        self.assertEqual(4.0, policy.value_for('s0', 'a1'))
        self.assertEqual(2.4, policy.value_for('s0', 'a2'))
        self.assertEqual('a1', policy.suggest_action_for_state('s0'))
Example #16
    def test_qlearning_control_1(self):

        target_policy = QTablePolicy()
        behavior_policy = GuidedPolicy(['vacuum', 'move(right)', 'vacuum'] * 2)
        control = QLearningControl(target_policy, behavior_policy, alpha=0.3)

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.learn_episode(mdp)

        s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(-0.3, target_policy.value_for(s0, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))

        s1 = frozenset({'robot(left)', 'dirty(right)'})
        self.assertEqual(-0.3, target_policy.value_for(s1, 'move(right)'))

        s2 = frozenset({'robot(right)', 'dirty(right)'})
        self.assertEqual(99 * 0.3, target_policy.value_for(s2, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))

        # Repeat with a second mdp and exactly the same actions.
        # The value estimates should propagate.

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.learn_episode(mdp)

        s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(-0.3 - 0.3, target_policy.value_for(s0, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))

        s1 = frozenset({'robot(left)', 'dirty(right)'})
        self.assertEqual(-0.3 + 0.3 * (-1 + 99 * 0.3 - (-0.3)),
                         target_policy.value_for(s1, 'move(right)'))

        s2 = frozenset({'robot(right)', 'dirty(right)'})
        self.assertEqual(99 * 0.3 + 0.3 * (99 + 0 - 99 * 0.3),
                         target_policy.value_for(s2, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
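
The asserted values are consistent with the standard tabular Q-learning update Q(s, a) <- Q(s, a) + alpha * (r + max_a' Q(s', a') - Q(s, a)) applied in forward (temporal) order, so the +99 goal reward moves back by only one state per episode. A minimal sketch reproducing the numbers (the bootstrap value used when re-updating s0 in the second episode is assumed to be Q(s1, 'move(right)') = -0.3):

    alpha = 0.3
    # Episode 1: all Q-values start at 0, and terminal states bootstrap with 0.
    q_s0 = 0 + alpha * (-1 + 0 - 0)     # -0.3
    q_s1 = 0 + alpha * (-1 + 0 - 0)     # -0.3
    q_s2 = 0 + alpha * (99 + 0 - 0)     # 29.7 == 99 * 0.3
    # Episode 2: forward order, so s0 bootstraps on s1's episode-1 estimate.
    q_s0 += alpha * (-1 + q_s1 - q_s0)  # -0.6 == -0.3 - 0.3
    q_s1 += alpha * (-1 + q_s2 - q_s1)  # == -0.3 + 0.3 * (-1 + 99 * 0.3 - (-0.3))
    q_s2 += alpha * (99 + 0 - q_s2)     # == 99 * 0.3 + 0.3 * (99 + 0 - 99 * 0.3)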
Example #17
    def test_monte_carlo_control_1(self):

        target_policy = QTablePolicy()
        behavior_policy_1 = GuidedPolicy([
            'move(right)', 'move(left)', 'move(right)', 'move(left)', 'vacuum',
            'move(right)', 'vacuum'
        ])
        behavior_policy_2 = GuidedPolicy(
            ['move(right)', 'vacuum', 'move(left)', 'vacuum'])

        control = FirstVisitMonteCarloControl(target_policy)

        # Even though this Monte-Carlo control is on-policy, we "cheat" for testing purposes
        # and give it a separate behavior policy.
        control.behavior_policy = behavior_policy_1

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.learn_episode(mdp)

        s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(97, target_policy.value_for(s0, 'vacuum'))
        # Only the first visit counts.
        self.assertEqual(93, target_policy.value_for(s0, 'move(right)'))

        s1 = frozenset({'robot(left)', 'dirty(right)'})
        self.assertEqual(98, target_policy.value_for(s1, 'move(right)'))

        s2 = frozenset({'robot(right)', 'dirty(right)'})
        self.assertEqual(99, target_policy.value_for(s2, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))

        s3 = frozenset({'robot(right)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(0, target_policy.value_for(s3, 'vacuum'))
        self.assertEqual(94, target_policy.value_for(s3, 'move(left)'))

        # Let's try a second episode with a different route

        # Even though this Monte-Carlo control is on-policy, we "cheat" for testing purposes
        # and give it a separate behavior policy.
        control.behavior_policy = behavior_policy_2

        mdp = VacuumCleanerWorldBuilder().build_mdp()
        control.learn_episode(mdp)

        s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(97, target_policy.value_for(s0, 'vacuum'))
        self.assertEqual((93 + 96) / 2.0,
                         target_policy.value_for(s0, 'move(right)'))

        s1 = frozenset({'robot(left)', 'dirty(right)'})
        self.assertEqual(98, target_policy.value_for(s1, 'move(right)'))

        s2 = frozenset({'robot(right)', 'dirty(right)'})
        self.assertEqual(99, target_policy.value_for(s2, 'vacuum'))
        self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))

        s3 = frozenset({'robot(right)', 'dirty(left)', 'dirty(right)'})
        self.assertEqual(97, target_policy.value_for(s3, 'vacuum'))
        self.assertEqual(94, target_policy.value_for(s3, 'move(left)'))
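
The averaged estimate for (s0, 'move(right)') follows from first-visit Monte-Carlo averaging across the two episodes (a minimal check, assuming undiscounted returns with a -1 step reward and a final reward of 99, as in the other tests):

    g_episode_1 = sum([-1] * 6 + [99])   # 93: seven actions, goal reached on the last
    g_episode_2 = sum([-1] * 3 + [99])   # 96: four actions, goal reached on the last
    assert (g_episode_1 + g_episode_2) / 2.0 == (93 + 96) / 2.0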
Example #18
    def test_optimal_value_for(self):

        policy = QTablePolicy()
        policy.initialize_state('s', {'a', 'b', 'c'})
        policy.update('s', 'a', -1)
        policy.update('s', 'b', 0)
        policy.update('s', 'c', 10)

        self.assertEqual(10, policy.optimal_value_for('s'))