    def xtest_learn(self):

        np.random.seed(1)
        states = [
            SimpleGridState((1, 1)),
            SimpleGridState((2, 2)),
            SimpleGridState((3, 3))
        ]
        rewards = [-1, -2]
        actions = [1, 2]
        episode = Episode(states, actions, rewards)

        grid_world = SimpleGridWorld()

        q_learner = build_learner(grid_world)
        q_learner.learn(episode)

        y = grid_world.action_value_function.on_list(episode.states)
        logging.info('Q Learning fitted targets are:')
        logging.info('\n' + str(y))

        # The learning rate is set to 0.1 by default, so we can calculate
        # the expected result using the actions and rewards above.
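        # A rough sanity check, assuming a zero-initialised action-value table
        # and one tabular update per transition,
        # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)):
        #   state (1, 1), action 1, reward -1  ->  0 + 0.1 * (-1 + 0 - 0) = -0.1
        #   state (2, 2), action 2, reward -2  ->  0 + 0.1 * (-2 + 0 - 0) = -0.2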
        expected = np.array([[0, -0.1, 0, 0], [0, 0, -0.2, 0], [0, 0, 0, 0]])

        np.testing.assert_array_equal(expected, y)
    def test_learn(self):
        np.random.seed(1)
        states = [
            SimpleGridState((1, 1)),
            SimpleGridState((2, 2)),
            SimpleGridState((2, 3))
        ]

        grid_world = SimpleGridWorld()

        q_learner = build_learner(grid_world)
        q_learner.learn(states)

        y = grid_world.action_value_function.on_list(states)
        logging.info('Q Learning fitted targets are:')
        logging.info('\n' + str(y))

        # The learning rate is set to 0.1 by default, so we can calculate
        # the expected result from the per-step rewards noted below.
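        # Each -1 step reward scaled by the 0.1 learning rate gives a fitted
        # value of -0.1; the single 0.0 entry suggests the down action from
        # (2, 3) yields a zero reward (presumably a goal/terminal transition).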
        expected = np.array([
            [-0.1, -0.1, -0.1, -0.1],  # Loses 1 in all directions
            [-0.1, -0.1, -0.1, -0.1],  # Loses 1 in all directions
            [-0.1, 0.0, -0.1, -0.1]  # Loses 1 in all but the down direction
        ])

        np.testing.assert_array_equal(expected, y)
    def test_training(self):
        grid_world = CliffWorld()
        grid_world.policy.epsilon = 0.4
        grid_world.action_value_function.learning_rate = 0.05

        states = GridState.all()

        learner = build_learner(grid_world, learning_algo='sarsa')

        with Timer('training') as t:
            for i in range(1000):
                learner.learn(states)

        # Build the expected result containing only the actions on the
        # expected path; the other cells can be unpredictable due to
        # Sarsa's random sampling.
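        # The integer values are the environment's action indices as returned
        # by get_greedy_action_grid(); cells marked nan are masked out below
        # so the assertion only checks the greedy path.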
        nan = np.nan
        expected = np.array(
            [[1, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
             [1, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0],
             [1, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0],
             [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0]],
            dtype=float)

        actions = grid_world.get_greedy_action_grid()

        # Filter out values that are off the expected path
        actions = actions.astype(float)
        actions[np.isnan(expected)] = np.nan

        np.testing.assert_array_almost_equal(expected, actions)
    def test_something(self):

        world = AntWorld()

        learner = build_learner(world,
                                calculator_type='modelbasedstatemachine')

        states_list = StateList(
            [AntState(position=position) for position in range(10)])

        for _ in range(1000):
            learner.learn(states_list, epochs=1)

        greedy_actions = calculate_greedy_actions(world)

        print(greedy_actions)
        expected_greedy_actions = r"""
FINDING HOME : >>x<<<<<<<
FINDING FOOD : >>>>>>>>x<
""".strip()
        self.assertEqual(expected_greedy_actions, greedy_actions)

        action_values = world.calculate_action_values()
        print(action_values)

        expected_action_values = [
            np.array([[8, 9], [8, 10], [np.nan, np.nan], [10, 8], [9, 7],
                      [8, 6], [7, 5], [6, 5], [5, 4], [4, 3]]),
            np.array([[6, 8], [7, 9], [8, 10], [9, 11], [10, 12], [11, 13],
                      [12, 14], [13, 15], [np.nan, np.nan], [15, 14]]),
        ]
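        # The nan rows correspond to each phase's target position (index 2 for
        # home, index 8 for food), where the expected values are left
        # unspecified and masked out of the comparisons below.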

        e0 = expected_action_values[0]
        r0 = action_values[0]
        r0[np.isnan(e0)] = np.nan
        np.testing.assert_array_almost_equal(e0, r0, decimal=0)

        e1 = expected_action_values[1]
        r1 = action_values[1]
        r1[np.isnan(e1)] = np.nan
        np.testing.assert_array_almost_equal(e1, r1, decimal=0)
    def test_training(self):
        grid_world = CliffWorld()
        grid_world.action_value_function.learning_rate = 0.05

        states = GridState.all()

        learner = build_learner(grid_world, learning_algo='qlearning')

        with Timer('training') as t:
            for i in range(500):
                learner.learn(states)

        value_grid = grid_world.get_value_grid()
        nan = np.nan
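        # The expected values below follow the negative number of steps to the
        # goal along the optimal path (an assumed reward of -1 per step with
        # no discounting): the cell beside the goal is worth -1, the next -2,
        # and so on.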
        expected = np.array([
            [-13., nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
            [-12., -11., -10., -9., -8., -7., -6., -5., -4., -3., -2., -1.],
            [-13., -12., -11., -10., -9., -8., -7., -6., -5., -4., -3., -2.],
            [-14., -13., -12., -11., -10., -9., -8., -7., -6., -5., -4., -3.],
        ])
        np.testing.assert_array_almost_equal(expected, value_grid, decimal=1)
from rl.core.learning.learner import build_learner
from rl.core.state import StateList
from rl.environments.line_world.rl_system import AntWorld
from rl.environments.line_world.rl_system import calculate_greedy_actions
from rl.environments.line_world.state import AntState
from rl.environments.line_world.constants import HOME_POSITION, FOOD_POSITION, FINDING_FOOD, FINDING_HOME

world = AntWorld()
learner = build_learner(world, calculator_type='modelbasedstatemachine')

states = [AntState(position=position) for position in range(10)]
states_list = StateList(states)

initial_greedy_actions = calculate_greedy_actions(world)

for _ in range(500):
    learner.learn(states_list, epochs=1)

greedy_actions = calculate_greedy_actions(world)

print('Initial Greedy Actions (should be random):')
print(initial_greedy_actions)

print(
    'Optimised Greedy Actions (should point at home(%s) and food(%s) positions):'
    % (HOME_POSITION, FOOD_POSITION))
print(greedy_actions)

action_values = world.calculate_action_values()

print('Home is at position %s' % HOME_POSITION)
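# A small, assumed addition to round off the script: the phase constants were
# imported above, so print the food position and the per-phase action values
# (the pairing of phases with the entries of action_values is an assumption).
print('Food is at position %s' % FOOD_POSITION)
for phase, values in zip((FINDING_HOME, FINDING_FOOD), action_values):
    print('Action values for phase %s:' % phase)
    print(values)
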
import textwrap

from rl.core.learning.learner import build_learner
from rl.environments.grid_world.cliff_world import CliffWorld, GridState
from rl.lib.timer import Timer

grid_world = CliffWorld()
grid_world.action_value_function.learning_rate = 0.05

states = GridState.all()

learner = build_learner(grid_world, learning_algo='qlearning')

with Timer('training') as t:
    for i in range(500):
        print('Epoch %s' % i)
        learner.learn(states)

print('=== Value Function ===')
print(grid_world.get_value_grid())

print('=== Greedy Actions ===')
greedy_actions = grid_world.get_greedy_action_grid_string()
print(textwrap.indent(greedy_actions, ' '))
import textwrap

from rl.core.experience import ExperienceGenerator
from rl.core.learning.learner import build_learner
from rl.core.learning.learning_system import LearningSystem
from rl.environments.grid_world.cliff_world import CliffWorld
from rl.lib.timer import Timer

grid_world = CliffWorld()
grid_world.policy.epsilon = 0.1
grid_world.action_value_function.learning_rate = 0.05

generator = ExperienceGenerator(grid_world)
learner = build_learner(grid_world)

learning_system = LearningSystem(learner, generator)
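# learn_many_times(1000) presumably runs 1000 generate-experience / learn
# cycles, pairing the epsilon-greedy ExperienceGenerator with the learner
# built above.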
with Timer('training') as t:
    learning_system.learn_many_times(1000)

print('=== Value Function ===')
print(grid_world.get_value_grid())

print('=== Greedy Actions ===')
greedy_actions = grid_world.get_greedy_action_grid_string()
print(textwrap.indent(greedy_actions, ' '))