def test_initialized_state_action_values(self):
    current_state = 's1'
    available_actions = {'a1', 'a2'}
    policy = QTablePolicy()
    policy.initialize_state(current_state, available_actions)
    # All action values of a freshly initialized state start at zero.
    self.assertEqual(0.0, policy.value_for(current_state, 'a1'))
    self.assertEqual(0.0, policy.value_for(current_state, 'a2'))
def test_qlearning_updates_both_policies(self):
    target_policy = QTablePolicy()
    behavior_policy = QTablePolicy()
    control = QLearningControl(target_policy, behavior_policy, alpha=0.2)
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.learn_episode(mdp, step_limit=3)
    self.assertEqual(target_policy._q_table, behavior_policy._q_table)
def test_value_for(self):
    planner_policy = MagicMock()
    qtable_policy = QTablePolicy()
    random_policy = RandomPolicy()
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    # Evaluation of a state-action pair should be the same as for the qtable policy.
    policy.initialize_state('s1', {'a1', 'a2'})
    policy.update('s1', 'a2', -1.23)
    self.assertEqual(-1.23, policy.value_for('s1', 'a2'))
def test_initialized_random_action_suggestion(self):
    current_state = 's1'
    available_actions = {'a1', 'a2'}
    policy = QTablePolicy()
    policy.initialize_state(current_state, available_actions)
    # Right after initialization all action values are equal, so a random
    # available action is returned. We use a mock that replaces
    # `random.choice` for testing.
    mocked_random_choice = MagicMock(return_value='a2')
    with patch('random.choice', mocked_random_choice):
        a0 = policy.suggest_action_for_state(current_state)
    # The result is determined by the mocked "random" choice.
    self.assertEqual('a2', a0)
    self.assertIn(a0, available_actions)
    # The mocked "random" choice should be called with all available actions.
    mocked_random_choice.assert_called_with(list(available_actions))
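# A worked check for the test below: in the vacuum-cleaner world every
# non-final step is rewarded with -1 and reaching the goal with +99, so the
# five-step guided episode yields the reward history [None, -1, -1, -1, -1, 99]
# and a return of G_0 = 4 * (-1) + 99 = 95 from the start state.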
def test_generate_episode_with_target_policy(self):
    target_policy = GuidedPolicy(
        ['move(right)', 'move(left)', 'vacuum', 'move(right)', 'vacuum'])
    behavior_policy = QTablePolicy()
    control = OffPolicyControl(target_policy, behavior_policy)
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.generate_episode_with_target_policy(mdp)
    self.assertEqual([None, -1, -1, -1, -1, 99], mdp.reward_history)
    self.assertEqual(95, mdp.return_history[0])
def test_update(self):
    planner_policy = MagicMock()
    qtable_policy = MagicMock(spec=QTablePolicy())
    random_policy = MagicMock(spec=RandomPolicy())
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    # Updating the policy should update the qtable policy as well.
    policy.initialize_state('s1', {'a1', 'a2'})
    policy.update('s1', 'a2', -1.23)
    qtable_policy.update.assert_called_with('s1', 'a2', -1.23)
def test_new_state(self):
    policy = QTablePolicy()
    self.assertTrue(policy.is_new_state(state='s1'))
    policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
    self.assertFalse(policy.is_new_state('s1'))
def test_optimal_value_for(self):
    planner_policy = MagicMock()
    qtable_policy = QTablePolicy()
    random_policy = RandomPolicy()
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    # The optimal value of a state should be the same as for the qtable
    # policy: the maximum over its action values.
    policy.initialize_state('s', {'a', 'b', 'c'})
    policy.update('s', 'a', 1.23)
    policy.update('s', 'b', -5.43)
    policy.update('s', 'c', 0.03)
    self.assertEqual(1.23, policy.optimal_value_for('s'))
def test_episode_time_limit(self):
    # If a step limit is set, the control algorithm should stop after
    # reaching that limit.
    target_policy = QTablePolicy()
    behavior_policy = GuidedPolicy(['move(right)', 'move(left)'] * 100)
    control = OffPolicyControl(target_policy, behavior_policy)
    control.policy_update_after_step = MagicMock()
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    mdp.transition = MagicMock(return_value=('some_next_state', -1))
    control.learn_episode(mdp, step_limit=3)
    self.assertEqual(3, len(control.policy_update_after_step.mock_calls))
    self.assertEqual(3, len(mdp.transition.mock_calls))
def test_is_new_state(self):
    qtable_policy = QTablePolicy()
    random_policy = RandomPolicy()
    mdp_builder = VacuumCleanerWorldBuilder()
    mdp = mdp_builder.build_mdp()
    planner_policy = PlannerPolicy(planning_horizon=1, mdp_builder=mdp_builder)
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    self.assertTrue(policy.is_new_state(state='s1'))
    policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
    self.assertFalse(policy.is_new_state('s1'))
def test_terminal_state(self):
    policy = QTablePolicy()
    # s0 is terminal because it has no available actions.
    policy.initialize_state('s0', set())
    action = policy.suggest_action_for_state('s0')
    self.assertIsNone(action)
    # Terminal states always have a value of zero.
    value = policy.value_for('s0', None)
    self.assertEqual(0, value)
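# The expected values in the test below follow from an SGD-style Monte-Carlo
# update, which we assume MonteCarloSGDControl implements as
# Q(s, a) += alpha * (G - Q(s, a)). Starting from Q = 0, a single episode
# therefore yields Q = alpha * G, e.g. Q(s0, 'vacuum') = 0.4 * 97 for the
# observed return G = 97.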
def test_monte_carlo_sgd_control_1(self):
    target_policy = QTablePolicy()
    behavior_policy = GuidedPolicy(['vacuum', 'move(right)', 'vacuum'])
    control = MonteCarloSGDControl(target_policy, alpha=0.4)
    # Even though this Monte-Carlo control is on-policy, we "cheat" for testing
    # purposes and give it a separate behavior policy.
    control.behavior_policy = behavior_policy
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.learn_episode(mdp)
    s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(0.4 * 97, target_policy.value_for(s0, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))
    s1 = frozenset({'robot(left)', 'dirty(right)'})
    self.assertEqual(0.4 * 98, target_policy.value_for(s1, 'move(right)'))
    s2 = frozenset({'robot(right)', 'dirty(right)'})
    self.assertEqual(0.4 * 99, target_policy.value_for(s2, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
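# The expected values in the test below assume the usual Q-learning update
# Q(s, a) += alpha * (r + max_a' Q(s', a') - Q(s, a)), applied to the episode's
# steps in reverse order: the last step is updated first, so the goal reward
# propagates through the whole episode in a single pass:
#   s2_val = 0.2 * 99
#   s1_val = 0.2 * (-1 + s2_val)
#   s0_val = 0.2 * (-1 + s1_val)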
def test_qlearning_reversed_update_control_1(self):
    target_policy = QTablePolicy()
    behavior_policy = GuidedPolicy(['vacuum', 'move(right)', 'vacuum'] * 2)
    control = QLearningReversedUpdateControl(target_policy, behavior_policy,
                                             alpha=0.2)
    mdp_builder = VacuumCleanerWorldBuilder()
    mdp = mdp_builder.build_mdp()
    control.learn_episode(mdp)
    lr = 0.2
    s2_val = lr * 99
    s1_val = lr * (-1 + s2_val)
    s0_val = lr * (-1 + s1_val)
    s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(s0_val, target_policy.value_for(s0, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))
    s1 = frozenset({'robot(left)', 'dirty(right)'})
    self.assertEqual(s1_val, target_policy.value_for(s1, 'move(right)'))
    s2 = frozenset({'robot(right)', 'dirty(right)'})
    self.assertEqual(s2_val, target_policy.value_for(s2, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))

    # After being guided to the goal, the target policy should know the way
    # to the goal.
    test_mdp = mdp_builder.build_mdp()
    self.assertSetEqual(s0, test_mdp.state)
    self.assertEqual('vacuum',
                     target_policy.suggest_action_for_state(test_mdp.state))
    test_mdp.transition('vacuum')
    self.assertEqual('move(right)',
                     target_policy.suggest_action_for_state(test_mdp.state))
    test_mdp.transition('move(right)')
    self.assertEqual('vacuum',
                     target_policy.suggest_action_for_state(test_mdp.state))
    test_mdp.transition('vacuum')
    self.assertIsNone(target_policy.suggest_action_for_state(test_mdp.state))

    # Repeat with a second mdp and exactly the same actions.
    # The value estimates should propagate.
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.learn_episode(mdp)
    self.assertSetEqual(frozenset({'robot(right)'}), mdp.state)
    s2_val += lr * (99 + 0 - s2_val)
    s1_val += lr * (-1 + s2_val - s1_val)
    s0_val += lr * (-1 + s1_val - s0_val)
    s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(s0_val, target_policy.value_for(s0, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))
    s1 = frozenset({'robot(left)', 'dirty(right)'})
    self.assertEqual(s1_val, target_policy.value_for(s1, 'move(right)'))
    s2 = frozenset({'robot(right)', 'dirty(right)'})
    self.assertEqual(s2_val, target_policy.value_for(s2, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
def test_tie_breaking(self):
    policy = QTablePolicy()
    policy.initialize_state('s0', ['a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
    policy.update(state='s0', action='a1', delta=5.2)
    policy.update(state='s0', action='a2', delta=3.2)
    policy.update(state='s0', action='a3', delta=5.2)
    policy.update(state='s0', action='a4', delta=-6.9)
    policy.update(state='s0', action='a6', delta=5.2)
    # Multiple actions have the same action-value estimate:
    self.assertEqual(5.2, policy.value_for('s0', 'a1'))
    self.assertEqual(5.2, policy.value_for('s0', 'a3'))
    self.assertEqual(5.2, policy.value_for('s0', 'a6'))
    # When choosing the greedy action, this tie is broken randomly.
    # We test this by patching `random.choice` and checking that the random
    # choice is made between the correct options {a1, a3, a6}.
    mocked_random_choice = MagicMock(return_value='a6')
    with patch('random.choice', mocked_random_choice):
        a0 = policy.suggest_action_for_state('s0')
    # The result is determined by the mocked "random" choice.
    self.assertEqual('a6', a0)
    # The random choice should only happen between optimal actions.
    mocked_random_choice.assert_called_with(['a1', 'a3', 'a6'])
def test_update(self):
    current_state = 's0'
    available_actions = {'a1', 'a2'}
    policy = QTablePolicy()
    policy.initialize_state(current_state, available_actions)
    # An update of 5.0 for (s0, a2) should make a2 the greedy choice.
    # TODO: Clarify whether `delta` is an abstract adjustment or something
    # fixed like a reward.
    policy.update(state='s0', action='a2', delta=5.0)
    self.assertEqual(0.0, policy.value_for('s0', 'a1'))
    self.assertEqual(5.0, policy.value_for('s0', 'a2'))
    self.assertEqual('a2', policy.suggest_action_for_state('s0'))
    # An update of 4.0 for (s0, a1) should keep a2 as the greedy choice.
    policy.update(state='s0', action='a1', delta=4.0)
    self.assertEqual(4.0, policy.value_for('s0', 'a1'))
    self.assertEqual(5.0, policy.value_for('s0', 'a2'))
    self.assertEqual('a2', policy.suggest_action_for_state('s0'))
    # An update of -2.6 for (s0, a2) should make a1 the greedy choice.
    policy.update(state='s0', action='a2', delta=-2.6)
    self.assertEqual(4.0, policy.value_for('s0', 'a1'))
    self.assertEqual(2.4, policy.value_for('s0', 'a2'))
    self.assertEqual('a1', policy.suggest_action_for_state('s0'))
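# A minimal sketch (our assumption, not the API under test) of the one-step
# Q-learning rule that the expected values in the next test are based on;
# the helper name `q_learning_update` is hypothetical.
def q_learning_update(q_table, state, action, reward, next_state, alpha):
    """Return the new estimate of Q(state, action) after one transition.

    `q_table` maps (state, action) pairs to values; unknown pairs count as 0.
    """
    # max_a' Q(s', a'), treating unseen successor states as all-zero.
    next_values = [v for (s, _), v in q_table.items() if s == next_state]
    target = reward + max(next_values, default=0.0)
    old = q_table.get((state, action), 0.0)
    return old + alpha * (target - old)
# With all values starting at zero, one episode writes only scaled immediate
# rewards (e.g. 0.3 * -1 and 0.3 * 99); the goal reward reaches earlier states
# one step per additional episode, as the second half of the test checks.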
def test_qlearning_control_1(self):
    target_policy = QTablePolicy()
    behavior_policy = GuidedPolicy(['vacuum', 'move(right)', 'vacuum'] * 2)
    control = QLearningControl(target_policy, behavior_policy, alpha=0.3)
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.learn_episode(mdp)
    s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(-0.3, target_policy.value_for(s0, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))
    s1 = frozenset({'robot(left)', 'dirty(right)'})
    self.assertEqual(-0.3, target_policy.value_for(s1, 'move(right)'))
    s2 = frozenset({'robot(right)', 'dirty(right)'})
    self.assertEqual(99 * 0.3, target_policy.value_for(s2, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))

    # Repeat with a second mdp and exactly the same actions.
    # The value estimates should propagate.
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.learn_episode(mdp)
    s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(-0.3 - 0.3, target_policy.value_for(s0, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s0, 'move(right)'))
    s1 = frozenset({'robot(left)', 'dirty(right)'})
    self.assertEqual(-0.3 + 0.3 * (-1 + 99 * 0.3 - (-0.3)),
                     target_policy.value_for(s1, 'move(right)'))
    s2 = frozenset({'robot(right)', 'dirty(right)'})
    self.assertEqual(99 * 0.3 + 0.3 * (99 + 0 - 99 * 0.3),
                     target_policy.value_for(s2, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
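# First-visit Monte-Carlo control, as exercised below, estimates Q(s, a) as
# the average of the returns following the first visit of (s, a) in each
# episode. In episode 1 the pair (s0, 'move(right)') occurs twice, but only
# the first occurrence's return (93) is recorded; episode 2 adds a return of
# 96, so the estimate becomes (93 + 96) / 2.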
def test_monte_carlo_control_1(self):
    target_policy = QTablePolicy()
    behavior_policy_1 = GuidedPolicy([
        'move(right)', 'move(left)', 'move(right)', 'move(left)', 'vacuum',
        'move(right)', 'vacuum'
    ])
    behavior_policy_2 = GuidedPolicy(
        ['move(right)', 'vacuum', 'move(left)', 'vacuum'])
    control = FirstVisitMonteCarloControl(target_policy)
    # Even though this Monte-Carlo control is on-policy, we "cheat" for testing
    # purposes and give it a separate behavior policy.
    control.behavior_policy = behavior_policy_1
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.learn_episode(mdp)
    s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(97, target_policy.value_for(s0, 'vacuum'))
    # Only the first visit of (s0, 'move(right)') counts.
    self.assertEqual(93, target_policy.value_for(s0, 'move(right)'))
    s1 = frozenset({'robot(left)', 'dirty(right)'})
    self.assertEqual(98, target_policy.value_for(s1, 'move(right)'))
    s2 = frozenset({'robot(right)', 'dirty(right)'})
    self.assertEqual(99, target_policy.value_for(s2, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
    s3 = frozenset({'robot(right)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(0, target_policy.value_for(s3, 'vacuum'))
    self.assertEqual(94, target_policy.value_for(s3, 'move(left)'))

    # Try a second episode with a different route, again "cheating" with a
    # separate behavior policy.
    control.behavior_policy = behavior_policy_2
    mdp = VacuumCleanerWorldBuilder().build_mdp()
    control.learn_episode(mdp)
    s0 = frozenset({'robot(left)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(97, target_policy.value_for(s0, 'vacuum'))
    self.assertEqual((93 + 96) / 2.0,
                     target_policy.value_for(s0, 'move(right)'))
    s1 = frozenset({'robot(left)', 'dirty(right)'})
    self.assertEqual(98, target_policy.value_for(s1, 'move(right)'))
    s2 = frozenset({'robot(right)', 'dirty(right)'})
    self.assertEqual(99, target_policy.value_for(s2, 'vacuum'))
    self.assertEqual(0, target_policy.value_for(s2, 'move(left)'))
    s3 = frozenset({'robot(right)', 'dirty(left)', 'dirty(right)'})
    self.assertEqual(97, target_policy.value_for(s3, 'vacuum'))
    self.assertEqual(94, target_policy.value_for(s3, 'move(left)'))
def test_optimal_value_for(self):
    policy = QTablePolicy()
    policy.initialize_state('s', {'a', 'b', 'c'})
    policy.update('s', 'a', -1)
    policy.update('s', 'b', 0)
    policy.update('s', 'c', 10)
    self.assertEqual(10, policy.optimal_value_for('s'))