def setUp(self):
    self.data = [Sample(np.array([0]), 0, 1, np.array([0])),
                 Sample(np.array([1]), 0, -1, np.array([1]))]
    self.basis = ExactBasis([2], 1)
    self.policy = Policy(self.basis,
                         .9,
                         0,
                         np.zeros((2,)),
                         Policy.TieBreakingStrategy.FirstWins)
def test_solver_uses_policy_and_data(self):
    """Test that the solver is passed the data and policy."""
    data = [10]
    initial_policy = Policy(FakeBasis(1))
    solver_stub = SolverParamStub(data, initial_policy)

    lspi.learn(solver_stub.data,
               solver_stub.policy,
               solver_stub,
               max_iterations=1)
def __init__(self, env, steps_per_episode):
    self.env = env
    self.obs_space = self.env.observation_space
    self.obs_space_size = self.env.observation_space.shape[0]
    self.loadNoSample = self.env.env.stabilization
    self.stab = True
    if self.stab:
        self.obs_space_size = self.env.observation_space.shape[0] - 1

    # For memory initialisation
    self.starting_episodes = 10
    self.starting_steps = steps_per_episode
    self.max_size = 100000
    self.memory = LSPIMemory(self.max_size)

    # Define the discrete action space
    self.min_vol = -9.0  # Minimal voltage to apply
    self.max_vol = 9.0  # Maximal voltage to apply
    self.size_action_space = 3
    self.action_space = np.arange(
        self.min_vol,
        self.max_vol + 1,
        (self.max_vol - self.min_vol) / (self.size_action_space - 1),
        dtype=float)
    self.action_space = torch.tensor(self.action_space)
    print(self.action_space)

    # Define the basis function
    self.number_means = 3
    self.rbf_means = self.number_means * self.obs_space_size
    self.gamma = 0.8
    self.lstdq = LSTDQ(self.gamma, self.obs_space_size, self.stab)
    self.initial_policy = Policy(self.action_space, self.obs_space_size,
                                 self.number_means, self.obs_space,
                                 self.stab)
    self.changing_policy = copy(self.initial_policy)
    self.epsilon = 0.0001

    # Using the policy
    self.test_episodes = 200
    self.test_timesteps = steps_per_episode
    self.still_learning = False
    self.old_weights = torch.Tensor([])

    # For plotting and evaluation
    self.rew_overall = 0
    self.rew_episodes_len = 10
    self.rew_episodes = np.zeros(self.rew_episodes_len)
    self.rwd_episodes_array = np.empty(0)
    self.rwd_overall_array = np.empty(0)
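# A quick sanity check on the action-space discretisation above: with
# min_vol = -9.0, max_vol = 9.0 and size_action_space = 3 the np.arange step
# is (9 - (-9)) / (3 - 1) = 9.0, and the stop value of max_vol + 1 keeps the
# upper endpoint in range. Below is a standalone sketch of that arithmetic;
# the variable names are illustrative and not part of the class above.
import numpy as np
import torch

min_vol, max_vol, n_actions = -9.0, 9.0, 3
step = (max_vol - min_vol) / (n_actions - 1)
actions = torch.tensor(np.arange(min_vol, max_vol + 1, step, dtype=float))
assert torch.equal(actions,
                   torch.tensor([-9.0, 0.0, 9.0], dtype=torch.float64))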
def test_epsilon_stopping_condition(self):
    """Test if learning stops when distance is less than epsilon."""
    with self.assertRaises(ValueError):
        lspi.learn(None, None, None, epsilon=0)

    epsilon_solver = EpsilonSolverStub(10**-21)
    lspi.learn(None,
               Policy(FakeBasis(1)),
               epsilon_solver,
               epsilon=10**-20,
               max_iterations=1000)

    self.assertEqual(epsilon_solver.num_calls, 1)
def test_max_iterations_stopping_condition(self):
    """Test if learning stops when max_iterations is reached."""
    with self.assertRaises(ValueError):
        lspi.learn(None, None, None, max_iterations=0)

    max_iterations_solver = MaxIterationsSolverStub()
    lspi.learn(None,
               Policy(FakeBasis(1)),
               max_iterations_solver,
               epsilon=10**-200,
               max_iterations=10)

    self.assertEqual(max_iterations_solver.num_calls, 10)
def test_returns_policy_with_new_weights(self):
    """Test that the returned policy is a new object whose weights are a
    copy, not the same underlying numpy vector."""
    initial_policy = Policy(FakeBasis(1))
    weight_solver = WeightSolverStub(initial_policy.weights)

    new_policy = lspi.learn(None,
                            initial_policy,
                            weight_solver,
                            max_iterations=1)

    self.assertEqual(weight_solver.num_calls, 1)
    self.assertFalse(np.may_share_memory(initial_policy.weights,
                                         new_policy.weights))
    self.assertNotEqual(id(initial_policy), id(new_policy))
    np.testing.assert_array_almost_equal(new_policy.weights,
                                         weight_solver.weights)
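# The stubs used above (SolverParamStub, EpsilonSolverStub,
# MaxIterationsSolverStub, WeightSolverStub) are defined elsewhere in the
# test suite. As a rough sketch of the pattern they follow -- this is an
# assumption about their shape, not the suite's actual code -- each stub
# records how often lspi.learn invoked it and returns canned weights:
import numpy as np


class WeightSolverStub(object):
    """Hypothetical solver stub: returns fixed weights, counts solve calls."""

    def __init__(self, weights):
        self.weights = weights
        self.num_calls = 0

    def solve(self, data, policy):
        # lspi.learn is expected to call solve() once per iteration.
        self.num_calls += 1
        return self.weights.copy()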
class TestPolicy(TestCase):

    def create_policy(self, *args, **kwargs):
        return Policy(FakeBasis(5), *args, **kwargs)

    @staticmethod
    def list_has_duplicates(values, num_places=4):
        # Verify whether there are duplicate q-values. Round the q-values so
        # that small floating point inconsistencies don't prevent duplicates
        # from being detected, then make a set of the list. If there are no
        # duplicates the cardinality of the set will match the length of the
        # list.
        rounded_values = map(lambda x: round(x, num_places), values)
        return len(set(rounded_values)) < len(values)

    def setUp(self):
        self.poly_policy = Policy(OneDimensionalPolynomialBasis(1, 2),
                                  weights=np.array([1., 1, 2, 2]))
        self.state = np.array([-3.])
        self.tie_weights = np.ones((4,))

    def test_default_constructor(self):
        policy = self.create_policy()

        self.assertTrue(isinstance(policy.basis, FakeBasis))
        self.assertAlmostEqual(policy.discount, 1.0)
        self.assertAlmostEqual(policy.explore, 0.0)
        self.assertEqual(policy.weights.shape, (1,))
        self.assertEqual(policy.tie_breaking_strategy,
                         Policy.TieBreakingStrategy.RandomWins)

    def test_full_constructor(self):
        policy = self.create_policy(.5, .1, np.array([1.]),
                                    Policy.TieBreakingStrategy.FirstWins)

        self.assertTrue(isinstance(policy.basis, FakeBasis))
        self.assertAlmostEqual(policy.discount, .5)
        self.assertAlmostEqual(policy.explore, 0.1)
        np.testing.assert_array_almost_equal(policy.weights, np.array([1.]))
        self.assertEqual(policy.tie_breaking_strategy,
                         Policy.TieBreakingStrategy.FirstWins)

    def test_discount_out_of_bounds(self):
        with self.assertRaises(ValueError):
            self.create_policy(discount=-1.0)

        with self.assertRaises(ValueError):
            self.create_policy(discount=1.1)

    def test_explore_out_of_bounds(self):
        with self.assertRaises(ValueError):
            self.create_policy(explore=-.01)

        with self.assertRaises(ValueError):
            self.create_policy(explore=1.1)

    def test_weight_basis_dimensions_mismatch(self):
        with self.assertRaises(ValueError):
            self.create_policy(weights=np.arange(2))

    def test_copy(self):
        orig_policy = self.create_policy()
        policy_copy = copy(orig_policy)

        self.assertNotEqual(id(orig_policy), id(policy_copy))
        self.assertEqual(orig_policy.num_actions, policy_copy.num_actions)
        self.assertEqual(orig_policy.discount, policy_copy.discount)
        self.assertEqual(orig_policy.explore, policy_copy.explore)
        np.testing.assert_array_almost_equal(orig_policy.weights,
                                             policy_copy.weights)
        self.assertNotEqual(id(orig_policy.weights), id(policy_copy.weights))

        # Verify that changing a weight in the original doesn't affect the
        # copy. numpy doesn't have an assert-not-equal method, so instead
        # assert that the two arrays are equal and expect the assertion to
        # fail.
        orig_policy.weights[0] *= -1

        with self.assertRaises(AssertionError):
            np.testing.assert_array_almost_equal(orig_policy.weights,
                                                 policy_copy.weights)

    def test_calc_q_value_unit_weights(self):
        q_value = self.poly_policy.calc_q_value(self.state, 0)
        self.assertAlmostEqual(q_value, -2.)

    def test_calc_q_value_non_unit_weights(self):
        q_value = self.poly_policy.calc_q_value(self.state, 1)
        self.assertAlmostEqual(q_value, -4.)
    def test_calc_q_value_negative_action(self):
        with self.assertRaises(IndexError):
            self.poly_policy.calc_q_value(self.state, -1)

    def test_calc_q_value_out_of_bounds_action(self):
        with self.assertRaises(IndexError):
            self.poly_policy.calc_q_value(self.state, 2)

    def test_calc_q_value_mismatched_state_dimensions(self):
        with self.assertRaises(ValueError):
            self.poly_policy.calc_q_value(np.ones((2,)), 0)

    def test_best_action_no_ties(self):
        q_values = [self.poly_policy.calc_q_value(self.state, action)
                    for action in range(self.poly_policy.num_actions)]
        self.assertFalse(TestPolicy.list_has_duplicates(q_values))

        best_action = self.poly_policy.best_action(self.state)
        self.assertEqual(best_action, 0)

    def test_best_action_with_ties_first_wins(self):
        self.poly_policy.weights = self.tie_weights
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.FirstWins

        q_values = [self.poly_policy.calc_q_value(self.state, action)
                    for action in range(self.poly_policy.num_actions)]
        self.assertTrue(TestPolicy.list_has_duplicates(q_values))

        best_action = self.poly_policy.best_action(self.state)
        self.assertEqual(best_action, 0)

    def test_best_action_with_ties_last_wins(self):
        self.poly_policy.weights = self.tie_weights
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.LastWins

        q_values = [self.poly_policy.calc_q_value(self.state, action)
                    for action in range(self.poly_policy.num_actions)]
        self.assertTrue(TestPolicy.list_has_duplicates(q_values))

        best_action = self.poly_policy.best_action(self.state)
        self.assertEqual(best_action, 1)

    def test_best_action_with_ties_random_wins(self):
        self.poly_policy.weights = self.tie_weights
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.RandomWins

        q_values = [self.poly_policy.calc_q_value(self.state, action)
                    for action in range(self.poly_policy.num_actions)]
        self.assertTrue(TestPolicy.list_has_duplicates(q_values))

        # Select the best action num_times times. This test will fail if the
        # selections were all action 0 or all action 1: when action 0 is
        # always selected the sum will equal 0, and when action 1 is always
        # taken the sum will equal num_times.
        num_times = 10
        best_actions = [self.poly_policy.best_action(self.state)
                        for i in range(num_times)]

        self.assertLess(int(sum(best_actions)), num_times)
        self.assertNotEqual(int(sum(best_actions)), 0)

    def test_best_action_mismatched_state_dimensions(self):
        with self.assertRaises(ValueError):
            self.poly_policy.best_action(np.ones((2,)))

    def test_select_action_random(self):
        # First verify that there are no ties; this way we know the
        # tie-breaking strategy isn't introducing the randomness.
        q_values = [self.poly_policy.calc_q_value(self.state, action)
                    for action in range(self.poly_policy.num_actions)]
        self.assertFalse(TestPolicy.list_has_duplicates(q_values))

        self.poly_policy.explore = 1.0
        # this is set up to evaluate to no tie
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.FirstWins

        num_times = 10
        best_actions = [self.poly_policy.select_action(self.state)
                        for i in range(num_times)]

        self.assertNotEqual(sum(best_actions), 0)
        self.assertNotEqual(sum(best_actions), num_times)

    def test_select_action_deterministic(self):
        # First verify that there are no ties; this way we know the
        # tie-breaking strategy isn't introducing the randomness.
        q_values = [self.poly_policy.calc_q_value(self.state, action)
                    for action in range(self.poly_policy.num_actions)]
        self.assertFalse(TestPolicy.list_has_duplicates(q_values))

        self.poly_policy.explore = 0.0
        # this is set up to evaluate to no tie
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.FirstWins

        num_times = 10
        best_actions = [self.poly_policy.select_action(self.state)
                        for i in range(num_times)]

        self.assertEqual(sum(best_actions), 0)

    def test_select_action_mismatched_state_dimensions(self):
        with self.assertRaises(ValueError):
            self.poly_policy.select_action(np.ones((2,)))

    def test_num_actions_getter(self):
        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)
        self.poly_policy.basis.num_actions = 10
        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)

    def test_num_actions_setter(self):
        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)
        self.poly_policy.num_actions = 10
        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)
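# Worked check of the expected q-values asserted in the calc_q_value tests
# above: with OneDimensionalPolynomialBasis(degree=1, num_actions=2) the
# feature vector for state x = [-3.] is [1, x] placed in the block belonging
# to the chosen action, i.e. phi(x, 0) = [1, -3, 0, 0] and
# phi(x, 1) = [0, 0, 1, -3]. With weights [1, 1, 2, 2] this gives
# q(x, 0) = 1 - 3 = -2 and q(x, 1) = 2 - 6 = -4. A standalone numpy sketch
# (the per-action block layout is inferred from the tests, not verified
# against the basis implementation):
import numpy as np

weights = np.array([1., 1., 2., 2.])
phi_a0 = np.array([1., -3., 0., 0.])
phi_a1 = np.array([0., 0., 1., -3.])
assert weights.dot(phi_a0) == -2.0
assert weights.dot(phi_a1) == -4.0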
import numpy as np

import lspi
from lspi.domains import Domain, ChainDomain
from lspi.solvers import LSTDQSolver
from lspi.policy import Policy
from lspi.sample import Sample
from lspi.basis_functions import FakeBasis, OneDimensionalPolynomialBasis

if __name__ == '__main__':
    # data = [
    #     Sample(np.array([0]), 0, 1, np.array([0])),
    #     Sample(np.array([1]), 0, -1, np.array([1]), True)
    # ]

    precondition_value = .3
    initial_policy = Policy(
        OneDimensionalPolynomialBasis(3, 2), .9, 0,
        tie_breaking_strategy=Policy.TieBreakingStrategy.FirstWins)
    # initial_policy = Policy(lspi.basis_functions.RadialBasisFunction(
    #     np.array([[0], [2], [4], [6], [8]]), .5, 2), .9, 0)
    sampling_policy = Policy(FakeBasis(2), .9, 1)

    solver = LSTDQSolver(precondition_value)
    # weights = solver.solve(data[:-1], initial_policy)

    domain = ChainDomain()

    samples = []
    for i in range(1000):
        action = sampling_policy.select_action(domain.current_state())
        samples.append(domain.apply_action(action))

    learned_policy = lspi.learn(samples, initial_policy, solver)

    domain.reset()
    cumulative_reward = 0
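    # The script stops right after resetting the domain, so an evaluation
    # loop presumably follows. Below is a minimal sketch of such a loop,
    # assuming Sample exposes a .reward attribute (as in lspi.sample.Sample)
    # and that greedy evaluation is intended; the step count and the reward
    # bookkeeping here are illustrative, not the author's code.
    for i in range(1000):
        # learned_policy has explore=0, so select_action is greedy here.
        action = learned_policy.select_action(domain.current_state())
        sample = domain.apply_action(action)
        cumulative_reward += sample.reward

    print('Cumulative reward over 1000 greedy steps: %d' % cumulative_reward)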