def initExperiment(alg, optimistic=True):
    env = Maze(envmatrix, (7, 7))

    # create task
    task = MDPMazeTask(env)

    # create value table and initialize optimistically (ones) or pessimistically (zeros)
    table = ActionValueTable(81, 4)
    if optimistic:
        table.initialize(1.)
    else:
        table.initialize(0.)

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    learner = alg()

    # standard exploration is e-greedy, but a different type can be chosen as well
    # learner.explorer = BoltzmannExplorer()
    agent = LearningAgent(table, learner)
    agent.batchMode = False

    experiment = Experiment(task, agent)
    experiment.allRewards = []
    return experiment
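# A minimal usage sketch for initExperiment, assuming envmatrix is defined at
# module level (as in setup_RL below) and that Q, SARSA and the other PyBrain
# names used above are already imported; episode and interaction counts are arbitrary.
for algorithm in (Q, SARSA):
    experiment = initExperiment(algorithm, optimistic=True)
    for episode in range(100):              # arbitrary number of episodes
        experiment.doInteractions(100)      # step the agent through the maze
        experiment.agent.learn()            # update the action-value table
        experiment.agent.reset()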
def setup_RL():
    # create the maze with walls (1)
    envmatrix = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    env = Maze(envmatrix, (7, 7))
    # create task
    task = MDPMazeTask(env)
    # create value table and initialize with zeros
    table = ActionValueTable(81, 4)
    table.initialize(0.)
    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    # learner = Q()
    learner = SARSA()
    # create agent
    agent = LearningAgent(table, learner)
    # create experiment
    experiment = Experiment(task, agent)
    return experiment, agent, table
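# A usage sketch for setup_RL(), following the training-loop pattern used in the
# other examples in this file; the episode and interaction counts are arbitrary.
experiment, agent, table = setup_RL()
for episode in range(200):                  # arbitrary number of training episodes
    experiment.doInteractions(100)          # let the agent act in the maze
    agent.learn()                           # SARSA update of the action-value table
    agent.reset()
# best Q-value per state, reshaped onto the 9x9 maze grid
print(table.params.reshape(81, 4).max(1).reshape(9, 9))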
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()
    # The goal appears to be in the upper right
    structure = [
        '!!!!!!!!!!',
        '! ! ! ! !',
        '! !! ! ! !',
        '! ! !',
        '! !!!!!! !',
        '! ! ! !',
        '! ! !!!! !',
        '! !',
        '! !!!!! !',
        '! ! !',
        '!!!!!!!!!!',
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()
        # 4 actions, shape.prod() locations/states (the full maze grid)
        # max(1) plots the largest action value for each square
        pylab.pcolor(controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape))
        pylab.draw()

    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
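    # Sketch only: reading back the greedy action for a single cell of the maze
    # above. This assumes it is appended at the end of maze() (it uses the locals
    # controller, structure and shape), a row-major flat state index like the
    # getObservation snippet further down, and the same NESW action order.
    q_values = controller.params.reshape(shape.prod(), 4)   # one row of Q-values per state
    row, col = 1, 1                                          # an arbitrary free cell
    state = row * structure.shape[1] + col                   # row-major flat state index (assumed)
    print('NESW'[int(np.argmax(q_values[state]))])           # greedy action for that cell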
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! ! ! ! !'),
        list('! !! ! ! !'),
        list('! ! !'),
        list('! !!!!!! !'),
        list('! ! ! !'),
        list('! ! !!!! !'),
        list('! !'),
        list('! !!!!! !'),
        list('! ! !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
One of these environments is the maze environment, which we will use for this
tutorial. It creates a labyrinth with free fields, walls, and a goal point. An
agent can move over the free fields and needs to find the goal point. Let's
define the maze structure, a simple 2D numpy array, where 1 is a wall and 0 is
a free field:
"""
structure = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                   [1, 0, 0, 1, 0, 0, 0, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 1, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 0, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 1, 1]])
"""
Then we create the environment with the structure as first parameter and the
goal field tuple as second parameter:
"""
environment = Maze(structure, (7, 7))
"""
Next, we need an agent. The agent is where the learning happens. It can
interact with the environment with its .getAction() and .integrateObservation()
methods.

The agent itself consists of a controller, which maps states to actions, a
learner, which updates the controller parameters according to the interaction
it had with the world, and an explorer, which adds some explorative behaviour
to the actions. All standard agents already have a default explorer, so we
don't need to take care of that in this tutorial.

The controller in PyBrain is a module that takes states as inputs and
transforms them into actions. For value-based methods, like the Q-learning
algorithm we will use here, we need a module that implements the
ActionValueInterface.
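The tutorial's next step, sketched here from the other snippets in this file,
is to create such a table-based controller, with one row of action values per
state (81 states for the 9x9 maze) and one column per action, and to
initialize it:
"""
controller = ActionValueTable(81, 4)
controller.initialize(1.)
"""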
""" a filtered mapping to getSample of the underlying environment. """ obs = array([ self.env.perseus[0] * self.env.mazeTable.shape[0] + self.env.perseus[1] ]) return obs # create environment envmatrix = array([[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 1, 0, 0, 0, 0, 1], [1, 0, 0, 1, 0, 0, 1, 0, 1], [1, 0, 0, 1, 0, 0, 1, 0, 1], [1, 0, 0, 1, 0, 1, 1, 0, 1], [1, 0, 0, 0, 0, 0, 1, 0, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]) env = Maze(envmatrix, (7, 7)) # create task task = MazeTask(env) # create the ActionValueTable table = ActionValueTable(81, 4) table.initialize(1) # create agent with controller and learner agent = EpsilonGreedyAgent(table, QLambda(4)) experiment = ContinuousExperiment(task, agent) pylab.gray() pylab.ion()
def Py_Brain():
    ############################
    # pybrain
    ############################
    import math
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg
    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task
    import pylab

    # pylab.gray()
    # pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])
    num_states = int(structure.shape[0] * structure.shape[1])
    SQRT = int(math.sqrt(num_states))

    # print structure.item((1, 3))
    # environment = Maze(structure, (7, 7))  # second parameter is goal field tuple
    environment = Maze(structure, (1, 3))  # second parameter is goal field tuple
    print(type(environment))
    print(environment)

    # Standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4)  # [N, S, E, W]
    controller.initialize(1)
    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    # while True:
    for x in range(4):
        print(x)
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()
        pylab.pcolor(controller.params.reshape(num_states, 4).max(1).reshape(SQRT, SQRT))
        pylab.draw()
        # pylab.show()

    name = 'MAZE'
    plt.savefig(str(name) + '_PLOT.png')
    plt.close()
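    # Sketch: the greedy policy for the 5x5 maze could be printed just before
    # plt.savefig(...), mirroring maze()/test_maze() above (the flipud display
    # flip used there is omitted here); the same NESW action order is assumed.
    greedy_policy = np.argmax(controller.params.reshape(num_states, 4), 1)
    greedy_policy = np.array(list('NESW'))[greedy_policy].reshape(SQRT, SQRT)
    print('\n'.join(''.join(row) for row in greedy_policy))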
goal point. An agent can move over the free fields and needs to find the goal
point. Let's define the maze structure, a simple 2D numpy array, where 1 is a
wall and 0 is a free field:
"""
structure = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                   [1, 0, 0, 1, 0, 0, 0, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 1, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 0, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 1, 1]])
"""
Then we create the environment with the structure as first parameter and the
goal field tuple as second parameter:
"""
x_goal = 1
y_goal = 2
environment = Maze(structure, (x_goal, y_goal))
"""
Next, we need an agent. The agent is where the learning happens. It can
interact with the environment with its .getAction() and .integrateObservation()
methods.

The agent itself consists of a controller, which maps states to actions, a
learner, which updates the controller parameters according to the interaction
it had with the world, and an explorer, which adds some explorative behaviour
to the actions. All standard agents already have a default explorer, so we
don't need to take care of that in this tutorial.

The controller in PyBrain is a module that takes states as inputs and
transforms them into actions. For value-based methods, like the Q-learning
algorithm we will use here, we need a module that implements the
ActionValueInterface. There are currently two modules in PyBrain that do this:
the ActionValueTable for discrete actions and the ActionValueNetwork for
continuous actions.
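Since the maze is a discrete environment, the tutorial would continue, as the
other snippets in this file do, by building the table-based controller, a
learner such as Q() or SARSA(), and the learning agent (a sketch assembled
from those snippets, not verbatim tutorial code):
"""
controller = ActionValueTable(81, 4)
controller.initialize(1.)
learner = Q()
agent = LearningAgent(controller, learner)
"""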