# Prefixed with 'x' so the test runner does not collect it (i.e. the test is disabled).
def xtest_learn(self):
    np.random.seed(1)
    states = [
        SimpleGridState((1, 1)),
        SimpleGridState((2, 2)),
        SimpleGridState((3, 3))
    ]
    rewards = [-1, -2]
    actions = [1, 2]
    episode = Episode(states, actions, rewards)

    grid_world = SimpleGridWorld()
    q_learner = build_learner(grid_world)
    q_learner.learn(episode)

    y = grid_world.action_value_function.on_list(episode.states)

    logging.info('Q Learning fitted targets are:')
    logging.info('\n' + str(y))

    # The learning rate is set to 0.1 by default, so we can calculate
    # the expected result using the actions and rewards above.
    expected = np.array([[0, -0.1, 0, 0],
                         [0, 0, -0.2, 0],
                         [0, 0, 0, 0]])

    np.testing.assert_array_equal(expected, y)
def test_learn(self):
    np.random.seed(1)
    states = [
        SimpleGridState((1, 1)),
        SimpleGridState((2, 2)),
        SimpleGridState((2, 3))
    ]

    grid_world = SimpleGridWorld()
    q_learner = build_learner(grid_world)
    q_learner.learn(states)

    y = grid_world.action_value_function.on_list(states)

    logging.info('Q Learning fitted targets are:')
    logging.info('\n' + str(y))

    # The learning rate is set to 0.1 by default, so we can calculate
    # the expected result from the -1 rewards earned in each direction.
    expected = np.array([
        [-0.1, -0.1, -0.1, -0.1],  # Loses 1 in all directions
        [-0.1, -0.1, -0.1, -0.1],  # Loses 1 in all directions
        [-0.1, 0.0, -0.1, -0.1]    # Loses 1 in all but the down direction
    ])

    np.testing.assert_array_equal(expected, y)
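# A minimal, hypothetical sketch of the tabular backup behind the two tests
# above, assuming the action values start at zero, the discounted successor
# value is still zero, and the default learning rate of 0.1 applies. The
# function name and signature below are illustrative and not part of the
# library API.
def one_step_q_update(q, reward, max_next_q, alpha=0.1, gamma=1.0):
    """One Q-learning backup: q <- q + alpha * (reward + gamma * max_next_q - q)."""
    return q + alpha * (reward + gamma * max_next_q - q)

# With q = 0 and a zero successor value, a single backup on a reward of -1
# gives 0 + 0.1 * (-1 + 0 - 0) = -0.1, and a reward of -2 gives -0.2,
# matching the fitted targets asserted above.
assert one_step_q_update(0.0, -1.0, 0.0) == -0.1
assert one_step_q_update(0.0, -2.0, 0.0) == -0.2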
def test_training(self):
    grid_world = CliffWorld()
    grid_world.policy.epsilon = 0.4
    grid_world.action_value_function.learning_rate = 0.05
    states = GridState.all()

    learner = build_learner(grid_world, learning_algo='sarsa')

    with Timer('training') as t:
        for i in range(1000):
            learner.learn(states)

    # Build the expected result containing only the actions on the expected
    # path. The other entries can be unpredictable due to Sarsa's random
    # sampling.
    nan = np.nan
    expected = np.array(
        [[1, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [1, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0],
         [1, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0],
         [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0]],
        dtype=float)

    actions = grid_world.get_greedy_action_grid()

    # Filter out values that are off the expected path.
    actions = actions.astype(float)
    actions[np.isnan(expected)] = np.nan

    np.testing.assert_array_almost_equal(expected, actions)
def test_something(self):
    world = AntWorld()
    learner = build_learner(world, calculator_type='modelbasedstatemachine')

    states_list = StateList(
        [AntState(position=position) for position in range(10)])

    for _ in range(1000):
        learner.learn(states_list, epochs=1)

    greedy_actions = calculate_greedy_actions(world)
    print(greedy_actions)

    expected_greedy_actions = r"""
FINDING HOME : >>x<<<<<<<
FINDING FOOD : >>>>>>>>x<
""".strip()

    self.assertEqual(expected_greedy_actions, greedy_actions)

    action_values = world.calculate_action_values()
    print(action_values)

    expected_action_values = [
        np.array([[8, 9],
                  [8, 10],
                  [np.nan, np.nan],
                  [10, 8],
                  [9, 7],
                  [8, 6],
                  [7, 5],
                  [6, 5],
                  [5, 4],
                  [4, 3]]),
        np.array([[6, 8],
                  [7, 9],
                  [8, 10],
                  [9, 11],
                  [10, 12],
                  [11, 13],
                  [12, 14],
                  [13, 15],
                  [np.nan, np.nan],
                  [15, 14]]),
    ]

    # Mask out entries that are undefined in the expected arrays before comparing.
    e0 = expected_action_values[0]
    r0 = action_values[0]
    r0[np.isnan(e0)] = np.nan
    np.testing.assert_array_almost_equal(e0, r0, decimal=0)

    e1 = expected_action_values[1]
    r1 = action_values[1]
    r1[np.isnan(e1)] = np.nan
    np.testing.assert_array_almost_equal(e1, r1, decimal=0)
def test_training(self):
    grid_world = CliffWorld()
    grid_world.action_value_function.learning_rate = 0.05
    states = GridState.all()

    learner = build_learner(grid_world, learning_algo='qlearning')

    with Timer('training') as t:
        for i in range(500):
            learner.learn(states)

    value_grid = grid_world.get_value_grid()

    nan = np.nan
    expected = np.array([
        [-13., nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [-12., -11., -10., -9., -8., -7., -6., -5., -4., -3., -2., -1.],
        [-13., -12., -11., -10., -9., -8., -7., -6., -5., -4., -3., -2.],
        [-14., -13., -12., -11., -10., -9., -8., -7., -6., -5., -4., -3.],
    ])

    np.testing.assert_array_almost_equal(expected, value_grid, decimal=1)
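# A hedged reading of the expected value grid in the test above: assuming an
# undiscounted reward of -1 per move, each state's value converges to minus
# the number of steps on its shortest path to the goal, so the row leading to
# the goal falls off by exactly one per cell. The check below only verifies
# that property of the expected array and is illustrative, not library code.
import numpy as np

row_to_goal = np.array([-12., -11., -10., -9., -8., -7., -6., -5., -4., -3., -2., -1.])
steps_to_goal = np.arange(len(row_to_goal))[::-1] + 1  # 12, 11, ..., 1 steps remaining
assert np.array_equal(row_to_goal, -steps_to_goal.astype(float))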
from rl.core.learning.learner import build_learner
from rl.core.state import StateList
from rl.environments.line_world.rl_system import AntWorld
from rl.environments.line_world.rl_system import calculate_greedy_actions
from rl.environments.line_world.state import AntState
from rl.environments.line_world.constants import HOME_POSITION, FOOD_POSITION, FINDING_FOOD, FINDING_HOME

world = AntWorld()
learner = build_learner(world, calculator_type='modelbasedstatemachine')

states = [AntState(position=position) for position in range(10)]
states_list = StateList(states)

initial_greedy_actions = calculate_greedy_actions(world)

for _ in range(500):
    learner.learn(states_list, epochs=1)

greedy_actions = calculate_greedy_actions(world)

print('Initial Greedy Actions (should be random):')
print(initial_greedy_actions)

print('Optimised Greedy Actions (should point at home (%s) and food (%s) positions):'
      % (HOME_POSITION, FOOD_POSITION))
print(greedy_actions)

action_values = world.calculate_action_values()

print('Home is at position %s' % HOME_POSITION)
import textwrap

from rl.core.learning.learner import build_learner
from rl.environments.grid_world.cliff_world import CliffWorld, GridState
from rl.lib.timer import Timer

grid_world = CliffWorld()
grid_world.action_value_function.learning_rate = 0.05
states = GridState.all()

learner = build_learner(grid_world, learning_algo='qlearning')

with Timer('training') as t:
    for i in range(500):
        print('Epoch %s' % i)
        learner.learn(states)

print('=== Value Function ===')
print(grid_world.get_value_grid())

print('=== Greedy Actions ===')
greedy_actions = grid_world.get_greedy_action_grid_string()
print(textwrap.indent(greedy_actions, ' '))
import textwrap

from rl.core.experience import ExperienceGenerator
from rl.core.learning.learner import build_learner
from rl.core.learning.learning_system import LearningSystem
from rl.environments.grid_world.cliff_world import CliffWorld
from rl.lib.timer import Timer

grid_world = CliffWorld()
grid_world.policy.epsilon = 0.1
grid_world.action_value_function.learning_rate = 0.05

generator = ExperienceGenerator(grid_world)
learner = build_learner(grid_world)
learning_system = LearningSystem(learner, generator)

with Timer('training') as t:
    learning_system.learn_many_times(1000)

print('=== Value Function ===')
print(grid_world.get_value_grid())

print('=== Greedy Actions ===')
greedy_actions = grid_world.get_greedy_action_grid_string()
print(textwrap.indent(greedy_actions, ' '))