def test_simpletmaze():
    """Test the SimpleTMaze environment."""
    env = SimpleTMaze(2, 1, -1)
    env.start_new_episode()
    assert env.get_state() == State(x=0, y=0, symbol=0, goal_x=-1)
    expected_steps = [
        RLTestStep(
            State(x=0, y=0, symbol=0),
            [Action('up')],
            Action('up'),
            -1,
        ),
        RLTestStep(
            State(x=0, y=1, symbol=-1),
            [Action('up')],
            Action('up'),
            -1,
        ),
        RLTestStep(
            State(x=0, y=2, symbol=0),
            [Action('left'), Action('right')],
            Action('left'),
            10,
        ),
        RLTestStep(State(x=-1, y=2, symbol=0), [], None, None),
    ]
    for expected in expected_steps:
        assert env.get_observation() == expected.observation
        assert set(env.get_actions()) == set(expected.actions)
        if expected.action is not None:
            reward = env.react(expected.action)
            assert reward == expected.reward
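# For reference, RLTestStep is presumably a simple namedtuple bundling one
# step's expected observation, available actions, action to take, and expected
# reward. This is a sketch inferred from its usage in these tests, not its
# actual definition (which lives elsewhere in this module):
#
#     RLTestStep = namedtuple('RLTestStep', ['observation', 'actions', 'action', 'reward'])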
def test_memory_architecture():
    """Test the memory architecture meta-environment."""

    class TestEnv(Environment):
        """A simple environment whose state is a single integer index."""

        def __init__(self, size, index=0):
            """Initialize the TestEnv.

            Arguments:
                size (int): The length of one side of the square.
                index (int): The initial index.
            """
            super().__init__()
            self.size = size
            self.init_index = index
            self.index = self.init_index

        def get_state(self):  # noqa: D102
            return State(index=self.index)

        def get_observation(self):  # noqa: D102
            return State(index=self.index)

        def get_actions(self):  # noqa: D102
            if self.index == -1:
                return []
            else:
                return [Action(str(i)) for i in range(-1, self.size * self.size)]

        def reset(self):  # noqa: D102
            self.start_new_episode()

        def start_new_episode(self):  # noqa: D102
            self.index = self.init_index

        def react(self, action):  # noqa: D102
            assert action in self.get_actions()
            if action.name != 'no-op':
                self.index = int(action.name)
            if self.end_of_episode():
                return 100
            else:
                return -1

        def visualize(self):  # noqa: D102
            pass

    size = 5
    env = memory_architecture(TestEnv)(
        # memory architecture
        knowledge_store=NaiveDictKB(),
        # TestEnv
        size=size,
        index=0,
    )
    env.start_new_episode()
    for i in range(size * size):
        env.add_to_ltm(index=i, row=(i // size), col=(i % size))
    # test observation
    assert env.get_observation() == State(
        perceptual_index=0,
    ), env.get_observation()
    # test actions
    assert (set(env.get_actions()) == set([
        *(Action(str(i)) for i in range(-1, size * size)),
        Action('copy', src_buf='perceptual', src_attr='index', dst_buf='query', dst_attr='index'),
    ])), set(env.get_actions())
    # test pass-through reaction
    reward = env.react(Action('9'))
    assert env.get_observation() == State(
        perceptual_index=9,
    ), env.get_observation()
    assert reward == -1, reward
    # query test
    env.react(
        Action('copy', src_buf='perceptual', src_attr='index', dst_buf='query', dst_attr='index'))
    assert env.get_observation() == State(
        perceptual_index=9,
        query_index=9,
        retrieval_index=9,
        retrieval_row=1,
        retrieval_col=4,
    ), env.get_observation()
    # query with no results
    env.react(
        Action('copy', src_buf='retrieval', src_attr='row', dst_buf='query', dst_attr='row'))
    env.react(Action('0'))
    env.react(
        Action('copy', src_buf='perceptual', src_attr='index', dst_buf='query', dst_attr='index'))
    assert env.get_observation() == State(
        perceptual_index=0,
        query_index=0,
        query_row=1,
    ), env.get_observation()
    # delete test
    env.react(Action('delete', buf='query', attr='index'))
    assert env.get_observation() == State(
        perceptual_index=0,
        query_row=1,
        retrieval_index=5,
        retrieval_row=1,
        retrieval_col=0,
    ), env.get_observation()
    # next result test
    env.react(Action('next-result'))
    assert env.get_observation() == State(
        perceptual_index=0,
        query_row=1,
        retrieval_index=6,
        retrieval_row=1,
        retrieval_col=1,
    ), env.get_observation()
    # prev result test
    env.react(Action('prev-result'))
    assert env.get_observation() == State(
        perceptual_index=0,
        query_row=1,
        retrieval_index=5,
        retrieval_row=1,
        retrieval_col=0,
    ), env.get_observation()
    # complete the environment
    reward = env.react(Action('-1'))
    assert env.end_of_episode()
    assert reward == 100, reward
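# Reading the assertions above: the memory architecture appears to expose
# three buffers -- 'perceptual' (the wrapped environment's observation),
# 'query' (attributes copied in to cue long-term memory), and 'retrieval'
# (the matching LTM entry, steppable via next-result/prev-result).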
def test_simpletmaze_gatingmemory():
    """Test the gating memory meta-environment."""
    env = gating_memory(SimpleTMaze)(
        num_memory_slots=1,
        reward=-0.05,
        length=2,
        hint_pos=1,
    )
    env.start_new_episode()
    goal = env.get_state().goal_x
    assert env.get_state() == State(x=0, y=0, symbol=0, goal_x=goal, memory_0=None)
    expected_steps = [
        RLTestStep(
            State(x=0, y=0, symbol=0, memory_0=None),
            [
                Action('up'),
                Action('gate', slot=0, attribute='x'),
                Action('gate', slot=0, attribute='y'),
                Action('gate', slot=0, attribute='symbol'),
            ],
            Action('up'),
            -1,
        ),
        RLTestStep(
            State(x=0, y=1, symbol=goal, memory_0=None),
            [
                Action('up'),
                Action('gate', slot=0, attribute='x'),
                Action('gate', slot=0, attribute='y'),
                Action('gate', slot=0, attribute='symbol'),
            ],
            Action('gate', slot=0, attribute='symbol'),
            -0.05,
        ),
        RLTestStep(
            State(x=0, y=1, symbol=goal, memory_0=goal),
            [
                Action('up'),
                Action('gate', slot=0, attribute='x'),
                Action('gate', slot=0, attribute='y'),
                Action('gate', slot=0, attribute='symbol'),
            ],
            Action('up'),
            -1,
        ),
        RLTestStep(
            State(x=0, y=2, symbol=0, memory_0=goal),
            [
                Action('left'),
                Action('right'),
                Action('gate', slot=0, attribute='x'),
                Action('gate', slot=0, attribute='y'),
                Action('gate', slot=0, attribute='symbol'),
            ],
            # deliberately take the wrong turn to check the penalty
            Action('right' if goal == -1 else 'left'),
            -10,
        ),
        RLTestStep(
            State(x=1 if goal == -1 else -1, y=2, symbol=0, memory_0=goal),
            [],
            None,
            None,
        ),
    ]
    for expected in expected_steps:
        assert env.get_observation() == expected.observation
        assert set(env.get_actions()) == set(expected.actions)
        if expected.action is not None:
            reward = env.react(expected.action)
            assert reward == expected.reward
def test_linear_agent():
    """Test the linear approximation Q-learning agent."""

    class InfiniteGridWorld(Environment, RandomMixin):
        """An infinite gridworld. Goal is (0, 0)."""

        def __init__(self, max_size, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.max_size = max_size
            self.row = 0
            self.col = 0

        def get_state(self):  # noqa: D102
            return State(row=self.row, col=self.col)

        def get_actions(self):  # noqa: D102
            if self.row == self.col == 0:
                return []
            else:
                return [
                    Action('up'),
                    Action('down'),
                    Action('left'),
                    Action('right'),
                    Action('upleft'),
                    Action('upright'),
                    Action('downleft'),
                    Action('downright'),
                ]

        def reset(self):  # noqa: D102
            self.start_new_episode()

        def start_new_episode(self):  # noqa: D102
            while self.row == self.col == 0:
                self.row = self.rng.randrange(-self.max_size, self.max_size + 1)
                self.col = self.rng.randrange(-self.max_size, self.max_size + 1)

        def react(self, action=None):  # noqa: D102
            assert action in self.get_actions()
            if 'up' in action.name:
                self.row -= 1
            if 'down' in action.name:
                self.row += 1
            if 'left' in action.name:
                self.col -= 1
            if 'right' in action.name:
                self.col += 1
            if self.row == self.col == 0:
                return 1
            else:
                return 0

        def visualize(self):  # noqa: D102
            raise NotImplementedError

    def feature_extractor(state, action=None):  # pylint: disable = unused-argument
        # reduce each coordinate to its sign, so the optimal policy
        # (move toward the origin) is linear in the features
        return {
            'row': (0 if state['row'] == 0 else copysign(1, state['row'])),
            'col': (0 if state['col'] == 0 else copysign(1, state['col'])),
        }

    size = 1000
    env = InfiniteGridWorld(max_size=size)
    agent = LinearQLearner(
        learning_rate=0.1,
        discount_rate=0.9,
        feature_extractor=feature_extractor,
    )
    # train the agent by forcing it to take the optimal action
    # (move diagonally toward the origin) at every step
    for _ in range(50):
        env.start_new_episode()
        while not env.end_of_episode():
            observation = env.get_observation()
            name = ''
            if observation['row'] < 0:
                name += 'down'
            elif observation['row'] > 0:
                name += 'up'
            if observation['col'] < 0:
                name += 'right'
            elif observation['col'] > 0:
                name += 'left'
            action = Action(name)
            action = agent.force_act(observation, action)
            reward = env.react(action)
            agent.observe_reward(env.get_observation(), reward)
    # test that the agent can finish within `2 * size` steps
    for _ in range(50):
        env.start_new_episode()
        step = 2 * size
        while step > 0 and not env.end_of_episode():
            observation = env.get_observation()
            action = agent.act(observation, env.get_actions())
            reward = env.react(action)
            step -= 1
        assert env.end_of_episode()
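# A minimal sketch of the linear function approximation being tested,
# assuming per-(feature, action) weights; illustrative only, and not
# LinearQLearner's actual implementation:
#
#     def linear_q_value(weights, features, action):
#         """Q(s, a) as a dot product of features and action-specific weights."""
#         return sum(
#             value * weights.get((name, action), 0)
#             for name, value in features.items()
#         )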
def test_gridworld():
    """Test the GridWorld environment."""
    env = GridWorld(
        width=2,
        height=3,
        start=[0, 0],
        goal=[2, 0],
    )
    env.start_new_episode()
    expected_steps = [
        RLTestStep(State(row=0, col=0), [Action('down'), Action('right')], Action('right'), -1),
        RLTestStep(State(row=0, col=1), [Action('down'), Action('left')], Action('down'), -1),
        RLTestStep(State(row=1, col=1), [Action('up'), Action('down'), Action('left')], Action('down'), -1),
        RLTestStep(State(row=2, col=1), [Action('up'), Action('left')], Action('up'), -1),
        RLTestStep(State(row=1, col=1), [Action('up'), Action('down'), Action('left')], Action('left'), -1),
        RLTestStep(
            State(row=1, col=0),
            [Action('up'), Action('down'), Action('right')],
            Action('down'),
            1,
        ),
        RLTestStep(State(row=2, col=0), [], None, None),
    ]
    for expected in expected_steps:
        assert env.get_observation() == expected.observation
        assert set(env.get_actions()) == set(expected.actions)
        if expected.action is not None:
            reward = env.react(expected.action)
            assert reward == expected.reward
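# A usage sketch (not one of the original tests): drive an environment with
# uniformly random actions until the episode ends, returning the total
# reward. It uses only the environment API exercised above.
def random_rollout(env):
    """Run one episode with uniformly random actions; return the total reward."""
    from random import choice  # local import to keep the sketch self-contained
    env.start_new_episode()
    total = 0
    while not env.end_of_episode():
        total += env.react(choice(env.get_actions()))
    return total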
def test_simpletmaze_fixedltm():
    """Test the fixed LTM meta-environment."""
    env = fixed_long_term_memory(SimpleTMaze)(
        num_wm_slots=1,
        num_ltm_slots=1,
        reward=-0.05,
        length=2,
        hint_pos=1,
        goal_x=1,
    )
    env.start_new_episode()
    assert env.get_state() == State(x=0, y=0, symbol=0, goal_x=1, wm_0=None, ltm_0=None)
    expected_steps = [
        RLTestStep(
            State(x=0, y=0, symbol=0, wm_0=None),
            [
                Action('up'),
                Action('store', slot=0, attribute='x'),
                Action('store', slot=0, attribute='y'),
                Action('store', slot=0, attribute='symbol'),
                Action('retrieve', wm_slot=0, ltm_slot=0),
            ],
            Action('up'),
            -1,
        ),
        RLTestStep(
            State(x=0, y=1, symbol=1, wm_0=None),
            [
                Action('up'),
                Action('store', slot=0, attribute='x'),
                Action('store', slot=0, attribute='y'),
                Action('store', slot=0, attribute='symbol'),
                Action('retrieve', wm_slot=0, ltm_slot=0),
            ],
            Action('store', slot=0, attribute='symbol'),
            -0.05,
        ),
        RLTestStep(
            State(x=0, y=1, symbol=1, wm_0=None),
            [
                Action('up'),
                Action('store', slot=0, attribute='x'),
                Action('store', slot=0, attribute='y'),
                Action('store', slot=0, attribute='symbol'),
                Action('retrieve', wm_slot=0, ltm_slot=0),
            ],
            Action('up'),
            -1,
        ),
        RLTestStep(
            State(x=0, y=2, symbol=0, wm_0=None),
            [
                Action('left'),
                Action('right'),
                Action('store', slot=0, attribute='x'),
                Action('store', slot=0, attribute='y'),
                Action('store', slot=0, attribute='symbol'),
                Action('retrieve', wm_slot=0, ltm_slot=0),
            ],
            Action('retrieve', wm_slot=0, ltm_slot=0),
            -0.05,
        ),
        RLTestStep(
            State(x=0, y=2, symbol=0, wm_0=1),
            [
                Action('left'),
                Action('right'),
                Action('store', slot=0, attribute='x'),
                Action('store', slot=0, attribute='y'),
                Action('store', slot=0, attribute='symbol'),
                Action('retrieve', wm_slot=0, ltm_slot=0),
            ],
            Action('right'),
            10,
        ),
        RLTestStep(State(x=1, y=2, symbol=0, wm_0=1), [], None, None),
    ]
    for expected in expected_steps:
        assert env.get_observation() == expected.observation
        assert set(env.get_actions()) == set(expected.actions)
        if expected.action is not None:
            reward = env.react(expected.action)
            assert reward == expected.reward
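# Convenience entry point, assuming these tests are normally collected by
# pytest; running this file directly executes them in order.
if __name__ == '__main__':
    test_simpletmaze()
    test_memory_architecture()
    test_simpletmaze_gatingmemory()
    test_linear_agent()
    test_gridworld()
    test_simpletmaze_fixedltm()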