def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)
    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()
    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
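# NOTE: a minimal import preamble that the maze examples in this file assume;
# these are the same PyBrain modules imported explicitly inside the Py_Brain()
# example further down, shown here as a sketch so the snippet runs standalone.
import numpy as np

from pybrain.rl.environments.mazes import Maze, MDPMazeTask
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q
from pybrain.rl.experiments import Experiment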
class Team(object):
    def __init__(self, living, task, learner=ENAC()):
        self.living = living
        self.task = task
        self.last_reward = 0
        self.agent = LearningAgent(self.living.brain, learner)
        self.oldparams = self.living.brain.params

    def Interaction(self):
        self.agent.integrateObservation(self.task.getObservation())
        self.task.performAction(self.agent.getAction())
        self.last_reward = self.task.getReward()
        self.agent.giveReward(self.last_reward)
        finished = self.task.isFinished()
        if finished:
            #print task.cumreward
            self.agent.newEpisode()
            self.task.reset()
        return self.last_reward, finished

    def Learn(self, episodes=1):
        self.agent.learn(episodes)
        self.agent.reset()
        newparams = self.living.brain.params.copy()  #get_all_weights(eater.brain)[:]
        dif = 0
        j = 0
        for i in newparams:
            dif += (self.oldparams[j] - newparams[j]) ** 2
            j += 1
        self.oldparams = newparams
        return dif
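# NOTE: a vectorized sketch of the weight-change metric that the dif loop in
# Team.Learn() computes: the squared Euclidean distance between consecutive
# parameter vectors. oldparams and newparams stand for the flat weight vectors
# compared above (brain.params is a flat numpy array in PyBrain).
import numpy as np

def weight_change(oldparams, newparams):
    # squared Euclidean distance between old and new weight vectors
    return float(np.sum((np.asarray(oldparams) - np.asarray(newparams)) ** 2))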
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)
    agent = LearningAgent(controller, learner)
    score_list = []
    turn_list = []
    # +100 extra iterations to match the training budget of the neural-network version
    for i in range(600):
        print_state(agent.module.getValue, 'table')
        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)
        agent.learn()
        agent.reset()
        print i, int(numpy.mean(score_list)), max(score_list), score, turn
    with open('./agent.dump', 'w') as f:
        pickle.dump(agent, f)
    with open('./score.dump', 'w') as f:
        pickle.dump([score_list, turn_list], f)
def train():
    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The task is the game this time
    task = environment

    # Make the reinforcement learning agent (use a network because inputs are continuous)
    network = ActionValueNetwork(task.nSenses, task.nActions)

    # Use Q learning for updating the table (NFQ is for networks)
    learner = NFQ()
    learner.gamma = GAMMA

    agent = LearningAgent(network, learner)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the learner
    meanScores = []
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Iteration ", i, " With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        meanScores.append(task.meanScore)
        agent.learn()
        agent.reset()

    params = {"learningEpochs": LEARNING_EPOCHS,
              "gamesPerEpoch": GAMES_PER_EPOCH,
              "gamma": GAMMA}
    return meanScores, params, agent
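# NOTE: train() references module-level constants that are not shown in the
# snippet; the values below are illustrative placeholders, not the originals.
LEARNING_EPOCHS = 100   # outer training iterations (hypothetical value)
GAMES_PER_EPOCH = 20    # episodes collected before each learn() call (hypothetical value)
GAMMA = 0.9             # discount factor assigned to learner.gamma (hypothetical value)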
def main():
    # if os.path.exists('./agent.dump'):
    #     with open('./agent.dump') as f:
    #         agent = pickle.load(f)
    # else:
    controller = ActionValueNetwork(9, 4)
    learner = NFQ()
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        score = play(agent)
        score_list.append(score)

        # At this point Q raised:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   (in pybrain/rl/learners/valuebased/q.py)
        # => switching the learner from Q to NFQ fixed it; see
        #    http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        #agent.learn()
        agent.reset()

        #data = [[0,0,0,0], [0,0,0,0], [0,0,0,2], [0,0,0,2]]
        data = [[0,0,2], [0,0,0], [0,0,2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print i, int(numpy.mean(score_list)), max(score_list), move

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump(score_list, f)
class QAlgorithm:
    def Pause(self):  # if the menu says pause, pause execution
        while self.state == 1:
            time.sleep(.05)
        return True

    def Quit(self):  # if the menu says quit, stop running
        self.process.terminate()
        return False

    def Start(self):  # starts the bot
        if self.process == None:
            self.runBot()
            #self.process = multiprocessing.Process(target=self.runBot, args=[])
            #self.process.start()
        return True

    def CheckState(self):  # checks what state the menu says to be in
        if self.state == 0:
            self.Start()
        elif self.state == 1:
            self.Pause()
        elif self.state == 2:
            self.Quit()

    def GameOver(self):  # checks whether the state requires the bot to pause or quit, or if the game is over
        return self.CheckState() or self.sr.checkEndGame(self.endBox, self.gameOver)

    def __init__(self, rewardBox, box, gameOver, endGame, scoreArea):
        self.reward = rewardBox
        self.bbox = box
        self.environment = TEnviroment(box)  # custom environment class
        if os.path.isfile("bot.txt"):
            self.controller = pickle.load(open("bot.txt", "rb"))
        else:
            self.controller = ActionValueNetwork(50**2, 4)  # arguments: (framerate * maxPlaytime, number of actions)
        self.learner = Q()
        gf = {0: self.GameOver}
        self.agent = LearningAgent(self.controller, self.learner)
        self.task = TTask(self.environment, scoreArea, gf)  # needs a custom task
        self.experiment = EpisodicExperiment(self.task, self.agent)
        self.process = None
        self.endBox = endGame

    def runBot(self):  # runs the bot for a single episode
        self.experiment.doEpisodes()
        self.agent.learn()
        self.agent.reset()
        file = open("bot.txt", "wb+")
        pickle.dump(self.controller, file)
def learn(self, number_of_iterations):
    learner = Q(0.2, 0.8)
    task = CartMovingTask(self.environment)
    self.controller = ActionValueTable(
        reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
        self.force_granularity)
    self.controller.initialize(1.)
    agent = LearningAgent(self.controller, learner)
    experiment = Experiment(task, agent)
    for i in range(number_of_iterations):
        experiment.doInteractions(1)
        agent.learn()
        agent.reset()
    with open("test.pcl", "w+") as f:
        pickle.dump(self.controller, f)
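# NOTE: a minimal sketch of loading the Q-table pickled by learn() above, using
# the same Python 2 file handling as the snippet; "test.pcl" is the path used there.
import pickle

with open("test.pcl") as f:
    controller = pickle.load(f)
print controller.params  # flat array of Q-values, one row per state once reshaped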
def main():
    # Storing a table entry for every 2048 board is hopeless:
    # there are on the order of 14^16 distinct states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)
    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)
        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # At this point Q raised:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   (in pybrain/rl/learners/valuebased/q.py)
        # => switching the learner from Q to NFQ fixed it; see
        #    http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "  ", i, int(numpy.mean(score_list)), max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
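# NOTE: the 2048 examples call a play(agent) helper that is not shown. Judging
# from the agent calls used elsewhere in this file, it presumably drives the
# observe/act/reward cycle. A hypothetical stub; Game2048 and its methods are
# placeholders, not a real API.
def play(agent):
    game = Game2048()  # placeholder environment
    while not game.is_over():
        agent.integrateObservation(numpy.array(game.board).ravel())
        move = agent.getAction()
        reward = game.step(move)  # placeholder: apply the move, return the reward
        agent.giveReward(reward)
    agent.newEpisode()
    return game.score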
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()
    # The goal appears to be in the upper right
    structure = [
        '!!!!!!!!!!',
        '! !  ! ! !',
        '! !! ! ! !',
        '!    !   !',
        '! !!!!!! !',
        '! ! !    !',
        '! ! !!!! !',
        '!        !',
        '! !!!!!  !',
        '!   !    !',
        '!!!!!!!!!!',
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)
    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()
        # 4 actions, one state per maze cell; max(1) plots the biggest action
        # value for each square. (The original hard-coded reshape(81, 4) and a
        # 9x9 grid, which does not match this 11x10 maze and would raise a
        # ValueError; reshape to the actual maze shape instead.)
        pylab.pcolor(controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape))
        pylab.draw()

    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
def main():
    rospy.init_node("lauron_reinforcement_learning")
    environment = RLEnvironment()
    dim_state = environment.joint_states.shape[0]
    num_actions = len(environment.actions)
    controller = ActionValueNetwork(dim_state, num_actions)
    learner = SARSA()
    agent = LearningAgent(controller, learner)
    task = RLTask(environment)
    experiment = Experiment(task, agent)
    episode_counter = 0
    while True:
        print("Training episode {}".format(episode_counter))
        experiment.doInteractions(NUM_INTERACTIONS)
        agent.learn()
        agent.reset()
        episode_counter += 1
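# NOTE: the loop above runs forever and never persists the trained network.
# A sketch of periodic checkpointing with pickle, in the style of the other
# snippets in this file; the interval and path are illustrative, not original.
import pickle

CHECKPOINT_EVERY = 100  # hypothetical interval

# inside the while loop, after agent.learn():
if episode_counter % CHECKPOINT_EVERY == 0:
    with open("controller.pkl", "wb") as f:  # hypothetical path
        pickle.dump(controller, f)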
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! !  ! ! !'),
        list('! !! ! ! !'),
        list('!    !   !'),
        list('! !!!!!! !'),
        list('! ! !    !'),
        list('! ! !!!! !'),
        list('!        !'),
        list('! !!!!!  !'),
        list('!   !    !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)
    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()
    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    # NOTE: this expected string is 5x5 and matches the simplified maze in the
    # first example above, not the 11x10 maze used here.
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)
    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: ' + str(numpy.mean(controller.params)))
            print('max: ' + str(numpy.max(controller.params)))
            print('min: ' + str(numpy.min(controller.params)))

            if i % 500 == 0:  # save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()
        vrep.simxFinish(client_id)
class SimulationMaster:

    def __init__(self, n_threads=4, initial_port=19997, q_table_version=0,
                 batch_size=None, learner=None, explorer=None):
        self.barrier = Barrier(n_threads + 1, timeout=720)
        self.n_threads = n_threads
        self.initial_port = initial_port
        self.batch_size = batch_size
        self.controller = MyActionValueTable(q_table_version)
        if learner is None:
            self.learner = Q(0.5, 0.9)
        else:
            self.learner = learner
        if explorer is None:
            self.explorer = self.learner.explorer = EpsilonGreedyExplorer(0.2, 0.998)
        else:
            self.explorer = self.learner.explorer = explorer
        self.agent = LearningAgent(self.controller, self.learner)
        # Logger initialization
        self.logger = logging.getLogger('master_logger')
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(logging.FileHandler(Utils.DATA_PATH + 'learning-tables/master.log'))
        self.failed_simulations = []
        self.n_episodes = 0
        self.simulations = []
        self.initialize_simulations()

    def initialize_simulations(self):
        self.simulations = []
        for i in range(self.n_threads):
            if self.batch_size is not None:
                self.simulations.append(Simulation(self, self.initial_port + i, self.batch_size))
            else:
                self.simulations.append(Simulation(self, self.initial_port + i))

    def get_action(self, observation):
        action = self.controller.activate(observation)
        action = self.explorer.activate(observation, action)
        return action

    def add_observation(self, obs):
        """
        Adds an observation to the agent memory
        :param obs: 3-dimensional vector containing [observation, action, reward]
        """
        self.agent.integrateObservation(obs[0])
        self.agent.lastaction = obs[1]
        self.agent.giveReward(obs[2])

    def update_q_table(self):
        """
        Updates the q-table with the new simulator observations
        """
        for sim in self.simulations:
            for trace in sim.traces:
                for obs in trace:
                    self.add_observation(obs)
                self.agent.learn()
                self.agent.reset()
                self.n_episodes += 1
            sim.traces.clear()
        if self.explorer.epsilon > 0.1:
            self.explorer.epsilon = self.explorer.epsilon * self.explorer.decay
        if self.learner.alpha > 0.1:
            self.learner.alpha *= 0.999
        self.logger.info('new epsilon: {}'.format(self.explorer.epsilon))
        self.logger.info('new alpha: {}'.format(self.learner.alpha))
        self.logger.info('n episodes: {}'.format(self.n_episodes))

    def save_t_table(self):
        """
        Saves t-tables, one for each thread
        """
        for sim in self.simulations:
            sim.save_t_table()

    def run(self):
        self.controller.initialize(self.agent)
        for sim in self.simulations:
            sim.start()
        counter = 0
        while True:
            try:
                self.barrier.wait()  # wait until all simulations are done
                self.update_q_table()
                self.save_t_table()
                self.barrier.wait()  # free simulation threads and start a new cycle
                # Counter to avoid saving the q-table too often
                if counter == 5:
                    self.controller.save()
                    counter = 0
                else:
                    counter += 1
                while self.failed_simulations:
                    sim = self.failed_simulations.pop()
                    self.restart_simulation(sim)
            except BrokenBarrierError as e:
                self.logger.error('Broken Barrier Error Occurred')
                for sim in self.simulations:
                    sim.stop()
                for sim in self.simulations:
                    sim.join()
                del self.simulations
                self.initialize_simulations()
                self.barrier.reset()
                self.failed_simulations.clear()
                for sim in self.simulations:
                    sim.start()

    def restart_simulation(self, simulation):
        self.logger.info('Restarting simulation with port {}'.format(simulation.port))
        self.simulations.remove(simulation)
        new_simulation = Simulation(self, simulation.port)
        self.simulations.append(new_simulation)
        new_simulation.start()
        del simulation
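# NOTE: a minimal usage sketch for SimulationMaster, assuming V-REP simulator
# instances are reachable on consecutive ports starting at the default 19997
# used above.
master = SimulationMaster(n_threads=4, initial_port=19997)
master.run()  # blocks; the q-table is checkpointed periodically via controller.save()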
def Py_Brain():
    ############################
    # pybrain
    ############################
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg
    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task
    import pylab
    # (np and math are assumed to be imported at module level)

    #pylab.gray()
    #pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])
    num_states = int(structure.shape[0] * structure.shape[1])
    SQRT = int(math.sqrt(num_states))

    #print structure.item((1, 3))
    #environment = Maze(structure, (7, 7))  # second parameter is the goal field tuple
    environment = Maze(structure, (1, 3))  # second parameter is the goal field tuple
    print type(environment)
    print environment

    # The standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4)  # [N, S, E, W]
    controller.initialize(1)
    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    #while True:
    for x in range(4):
        print x
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()
        pylab.pcolor(controller.params.reshape(num_states, 4).max(1).reshape(SQRT, SQRT))
        pylab.draw()
        #pylab.show()

    name = 'MAZE'
    plt.savefig(str(name) + '_PLOT.png')
    plt.close()
def rl_optimizer(num_of_actions, params):
    # Defining the UI environment
    actionmatrix = range(num_of_actions)
    goal = False  # default goal
    ui_env = UI(params.num_states, actionmatrix, num_of_actions, goal,
                params.sensor_errors, params.confusion_error, params.penalties,
                params.grid, params.dimensions, params.init_position,
                params.goal_position)
    av_table = ActionValueTable(params.num_states, num_of_actions, ui_env)
    av_table.initialize(1)

    # Train the agent for each goal
    klm_tot = 0
    klm_avg = 0
    p_learned = 1

    ##############################################
    # Define the Q-learning agent
    learner = Q(0.5, 0.99)
    learner.explorer.epsilon = 0.7
    learner.explorer.decay = 0.999
    learner.explorer.env = ui_env
    agent = LearningAgent(av_table, learner)

    # Initialize the av-table; give the action matrix as input.
    av_table.initialize(-5., actionmatrix)

    # Define the task and experiment
    task = UITask(ui_env)
    experiment = EpisodicExperiment(task, agent, av_table)

    ##############################################
    # Training the agent
    for j in range(8):  # learning iterations
        runs = 50        # episodes in one iteration
        experiment.doEpisodes(runs)
        agent.learn()
        agent.reset()

    ##############################################
    # Evaluation of the UI and policy for the current goal
    # Loop to get an average: only needed if errors are used
    klm_tasks_tot = np.array([0.] * (params.num_states - 1))
    total_iterations = 1
    klm_tot = 0
    for i in range(total_iterations):
        # KLM value
        klm_g, best_path = evaluation(av_table, ui_env, task, False, params)
        if klm_g == -1:  # not learned
            klm_tot += 20 * 5
            p_learned = 0
            print "Policy not learned"
            break
        # Add to the total KLM
        klm_tot += klm_g

    # Average KLM estimate
    klm_avg += klm_tot / total_iterations
    return best_path, klm_avg
# Without the next line, the pyplot plot won't actually show up.
plt.pause(0.001)

performance = []
if not render:
    pf_fig = plt.figure()

while True:
    # one learning step after one episode of world-interaction
    experiment.doEpisodes(1)
    agent.learn(1)

    # test performance (these real-world experiences are not used for training)
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)

    print("reward avg", r)
    print("explorer epsilon", learner.explorer.epsilon)
    print("num episodes", agent.history.getNumSequences())
    print("update step", len(performance))
from pybrain.rl.experiments import Experiment

envmatrix = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                   [1, 0, 0, 1, 0, 0, 0, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 1, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 0, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 1, 1]])
environment = Maze(envmatrix, (7, 7))
task = MDPMazeTask(environment)
table = ActionValueTable(81, 4)
table.initialize(1.)
agent = LearningAgent(table, Q())
experiment = Experiment(task, agent)

plt.ion()
plt.gray()
for i in range(1000):
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()
    plt.pcolor(table.params.reshape(81, 4).max(axis=1).reshape(9, 9))
    plt.gcf().canvas.draw()
# define action-value table
# number of states is:
#
#    current value: 1-21
#
# number of actions:
#
#    Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
while True:
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()
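# NOTE: once trained, the greedy Stand/Hit policy can be read directly out of
# the table, mirroring the argmax pattern used by the maze examples above; a
# minimal sketch (action labels follow the Stand=0, Hit=1 comment).
import numpy as np

q_values = av_table.params.reshape(21, 2)  # one row per hand value, one column per action
greedy = np.argmax(q_values, axis=1)
for hand_value, action in enumerate(greedy, start=1):
    print('%d: %s' % (hand_value, 'Hit' if action else 'Stand'))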
# (fragment: the tail of a custom task class, presumably its reset() method,
# followed by module-level setup)
        EpisodicTask.reset(self)
        self.env.reset()

    @property
    def indim(self):
        return self.env.indim

    @property
    def outdim(self):
        return self.env.outdim


env = TetrisEnv(10, 20)  # Tetris
task = TetrisTask(env)
QNet = ActionValueNetwork(10 * 20 + 11, 6)
learner = NFQ()  # Q()?
learner._setExplorer(EpsilonGreedyExplorer(0.2, decay=0.99))
agent = LearningAgent(QNet, learner)
experiment = EpisodicExperiment(task, agent)

while True:
    experiment.doEpisodes(1)
    agent.learn()
    agent.reset()  # or call more sporadically...?
    task.reset()
def rl_optimizer(UImatrix, actionmatrix, actions_in_uis, actions_penalty, num_of_actions, params):
    policies = [([])] * params.num_states

    # Defining the UI environment
    goal = 1  # default goal
    ui_env = UI(UImatrix, actionmatrix, actions_in_uis, actions_penalty, goal, params)
    av_table = ActionValueTable(params.num_states, num_of_actions)

    klm_tot = 0
    klm_avg = 0
    p_learned = 1
    modality_table_total = np.array([0, 0, 0])
    klm_total = 0

    # Train the agent for each goal
    for g in range(0, ui_env.num_of_states):
        ##############################################
        # Define the Q-learning agent
        learner = Q(0.5, 0.9)
        learner.explorer.epsilon = 0.7
        learner.explorer.decay = 0.999
        learner.explorer.env = ui_env
        agent = LearningAgent(av_table, learner)

        # Define the task and experiment
        task = UITask(ui_env)
        experiment = EpisodicExperiment(task, agent)

        # Initialize the av-table; removes disallowed actions.
        av_table.initialize(1., actionmatrix)

        # Set the goal
        experiment.task.env.setGoal(g)

        for j in range(50):
            initial_state = mod(j, ui_env.num_of_states)
            if initial_state == g:
                continue
            experiment.task.env.setInitialState(initial_state)
            runs = 50
            experiment.doEpisodes(runs)
            agent.learn()
            agent.reset()

        ##############################################
        # Evaluation of the UI and policy for the current goal
        # Iterate to get an average - only needed if errors are used
        total_iterations = 10
        klm_tot = 0
        for i in range(total_iterations):
            # KLM value
            klm_g, modality_table = evaluation(av_table, ui_env, g, params)
            # Not learned
            if klm_g == -1:
                klm_tot += 20 * 5
                p_learned = 0
                return -1, 0, 0, 0, 0
            # Add to the total KLM
            klm_tot += klm_g / (params.num_states - 1)

        klm_avg += klm_tot / total_iterations
        modality_table_total += np.array(modality_table)
        klm_total += klm_avg

        if p_learned == 0:
            break

    return modality_table_total, klm_total
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", task, parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed)

    render = False
    plot = False

    plt.ion()
    env = CartPoleEnvironment()
    env.randomInitialization = False
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, 50)
    #print "dim: ", task.indim, task.outdim

    # the module takes the state as input and outputs one value per action
    bmodule = ActionValueRAND(task.outdim, task.indim)
    rlearner = RAND()
    blearner = RAND()
    # % of random actions
    bagent = LearningAgent(bmodule, rlearner)

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)
    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations=True,
                                           storeAllEvaluated=True,
                                           maxEvaluations=None,
                                           verbose=False))
    testagent = LearningAgent(module, None)
    pgpeexperiment = EpisodicExperiment(task, agent)
    randexperiment = EpisodicExperiment(task, bagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]

    ## train pgpe
    for episode in range(0, 50):
        # one learning step after one episode of world-interaction
        y = pgpeexperiment.doEpisodes(1)

    be, bf = agent.learner._bestFound()
    print be, bf
    print "generate data"
    be.numActions = 1
    gdagent = LearningAgent(be, blearner)
    experiment = EpisodicExperiment(task, gdagent)

    for episode in range(0, 1000):
        # print episode, " of 1000"
        # one learning step after one episode of world-interaction
        y = experiment.doEpisodes(1)
        # print y
        x = randexperiment.doEpisodes(1)
        # print len(y[0])
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        l = 5
        resList = (agent.learner._allEvaluations)[-l:-1]
        # print agent.learner._allEvaluations
        from scipy import array
        rLen = len(resList)
        avReward = array(resList).sum() / rLen
        # print avReward
        # print resList
        # exit(0)
        # print("Parameters:", agent.learner._bestFound())
        # print(" Evaluation:", episode,
        #       " BestReward:", agent.learner.bestEvaluation,
        #       " AverageReward:", avReward)
        # if agent.learner.bestEvaluation == 0:
        #     # print resList[-20:-1]
        #     print "done"
        #     break
        #print resList
        performance.append(avReward)
        env.delay = False
        testagent.reset()
        #experiment.agent = agent
        # performance.append(r)
        if plot:
            plotPerformance(performance, pf_fig)
        # print "reward avg", r
        # print "explorer epsilon", learner.explorer.epsilon
        # print "num episodes", agent.history.getNumSequences()
        # print "update step", len(performance)

    blearner.add_ds(rlearner.dataset)
    blearner.learn()
    #blearner.learnX(agent.learner._allEvaluated)
    print "done"
    return performance
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)

    render = False
    plot = False

    plt.ion()
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"], desiredValue=None)
    #print "dim: ", task.indim, task.outdim

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)
    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations=True,
                                           storeAllEvaluated=False,
                                           maxEvaluations=None,
                                           desiredEvaluation=1,
                                           verbose=False))
    # print agent
    # from pprint import pprint
    # pprint(vars(agent.learner))
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        #agent.learn(1)
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True

        if episode % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            #experiment.agent = testagent
            #r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            #for i in range(0, parameters["TestWith"]):
            #    y = testexperiment.doEpisodes(1)
            #    print agent.learner._allEvaluated
            # from pprint import pprint
            # pprint(vars(task))
            l = parameters["TestWith"]
            task.N = parameters["MaxRunsPerEpisodeTest"]
            experiment.doEpisodes(l)
            task.N = parameters["MaxRunsPerEpisode"]
            resList = (agent.learner._allEvaluations)[-l:-1]
            # print agent.learner._allEvaluations
            from scipy import array
            rLen = len(resList)
            avReward = array(resList).sum() / rLen
            # print avReward
            # print resList
            # exit(0)
            # print("Parameters:", agent.learner._bestFound())
            # print(" Evaluation:", episode,
            #       " BestReward:", agent.learner.bestEvaluation,
            #       " AverageReward:", avReward)
            # if agent.learner.bestEvaluation == 0:
            #     # print resList[-20:-1]
            #     print "done"
            #     break
            performance.append(avReward)
            env.delay = False
            testagent.reset()
            #experiment.agent = agent
            # performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
            # print "reward avg", r
            # print "explorer epsilon", learner.explorer.epsilon
            # print "num episodes", agent.history.getNumSequences()
            # print "update step", len(performance)

    # print "done"
    return performance

#print "network", json.dumps(module.bn.net.E, indent=2)

#import sumatra.parameters as p
#import sys
#parameter_file = sys.argv[1]
#parameters = p.SimpleParameterSet(parameter_file)
#
#run(["BalanceTask", parameters])
def rl_optimizer(UImatrix, buttonmatrix, num_of_actions, top_UI, params, batch_num, logging):
    # Defining the UI environment
    goal = 1  # default goal
    ui_env = UI(UImatrix, buttonmatrix, goal, params.sensor_errors,
                params.confusion_error, params.penalties)
    av_table = ActionValueTable(params.num_states, num_of_actions)

    # Check which actions are allowed
    actions_index = [[]] * params.num_states
    bad_actions_idx = []
    for j in range(0, params.num_states):
        for i in range(0, params.num_states):
            if i == j:
                continue
            if UImatrix[j][i] == 1:
                actions_index[j] = actions_index[j] + [1, 1, 1]
                bad_actions_idx = bad_actions_idx + [0, 0, 0]
            else:
                actions_index[j] = actions_index[j] + [0, 0, 0]
                bad_actions_idx = bad_actions_idx + [1, 1, 1]
    ui_env.actions_index = actions_index
    bad_actions = np.nonzero(np.array(bad_actions_idx))

    # Train the agent for each goal
    klm_tot = 0
    klm_avg = 0
    policies = []
    objective = -1
    p_learned = 1
    ii = 0
    for g in range(0, ui_env.num_of_states):
        ##########################
        # Define the Q-learning agent
        ##########################
        learner = Q(0.5, 0.9)
        learner.explorer.epsilon = 0.3
        learner.explorer.decay = 0.999
        learner.explorer.actions_index = actions_index
        learner.explorer.env = ui_env
        agent = LearningAgent(av_table, learner)

        ##########################
        # Define the task and experiment
        ##########################
        task = UITask(ui_env)
        experiment = EpisodicExperiment(task, agent)

        # Set low values for disallowed actions
        bad_actions = np.ones([ui_env.num_of_states, ui_env.num_of_states])
        for idx_state in range(ui_env.num_of_states):
            for idx_button in range(len(buttonmatrix[idx_state])):
                bad_actions[idx_state, buttonmatrix[idx_state]] = 0
        bad_actions = np.reshape(bad_actions, ui_env.num_of_states * ui_env.num_of_states)
        bad_actions = np.nonzero(bad_actions)
        av_table.initialize(1., bad_actions[0])  # removed bad actions

        # Initialize saved N av-tables
        convergence_N = 1  # move to params
        av_tables_save = [] * convergence_N

        # Set the goal
        experiment.task.env.setGoal(g)

        ##########################
        # Training the agent
        ##########################
        # Add more iterations and runs if not learning.
        for j in range(10):
            initial_state = mod(j, ui_env.num_of_states)
            if initial_state == g:
                continue
            experiment.task.env.setInitialState(initial_state)
            runs = 15
            experiment.doEpisodes(runs)
            agent.learn()
            agent.reset()

        ##########################
        # Save the policy
        ##########################
        p = list(av_table.params)  # copies to a new memory slot
        policies.append(p)

        ##############################################
        # Evaluation of the UI and policy for the current goal
        ##############################################
        # Loop to get an average: only needed if errors are used
        klm_tasks_tot = np.array([0.] * (params.num_states - 1))
        total_iterations = 15
        klm_tot = 0
        for i in range(total_iterations):
            # KLM value
            klm_g = evaluation(av_table, ui_env, g, params, batch_num, logging)
            if klm_g == -1:  # not learned
                klm_tot += 20 * 5
                p_learned = 0
                break
            # Add to the total KLM
            klm_tot += klm_g / (params.num_states - 1)

        klm_avg += params.state_probs[g] * klm_tot / total_iterations
        if p_learned == 0:
            break

    if p_learned == 1:  # policy learned
        ##########################
        # Consistency
        ##########################
        consistency = 0
        idx = 0
        transitions_state = np.sum(UImatrix, 0)
        # Which buttons lead to the goal state
        buttons_to_states = np.zeros([params.num_states, params.num_states])
        for sr in range(params.num_states):
            for sc in range(params.num_states):
                if UImatrix[sr, sc] == 1:
                    buttons_to_states[sc][buttonmatrix[sr][idx]] += 1
                    idx = idx + 1
            idx = 0
        for s in range(params.num_states):
            for act in range(params.num_states):
                if buttons_to_states[s][act] > 0:
                    consistency += math.log(buttons_to_states[s][act] / transitions_state[s])

        ##########################
        # Objective function
        ##########################
        objective = params.w_klm * klm_avg
        objective = objective - 1 * params.w_const * consistency
        objective = objective + params.w_simpl * math.log(np.sum(np.sum(UImatrix, 1)))
        objective_func = [klm_avg, consistency, math.log(np.sum(np.sum(UImatrix, 1)))]

        ##########################
        # Save the best
        ##########################
        top_UI.append([UImatrix, buttonmatrix, policies, objective, objective_func, klm_avg])
        if len(top_UI) > params.top:
            top_UI = sorted(top_UI, key=op.itemgetter(3))[:params.top]

    return top_UI, objective
weeks = 52 * 2
days = 5  # number of samples per gradient estimate

for week in range(weeks):
    all_rewards = experiment.doEpisodes(number=days)
    tot_reward = numpy.mean(agent.history.getSumOverSequences('reward'))
    # print learner._allEvaluations#[-:-1]

    # Plot the reward at each period averaged over the week.
    r = -1.0 * numpy.array(all_rewards).reshape(days, nf)
    avg_r = numpy.mean(r, 0)
    plot.setData(5, rday, avg_r)

    # Plot the set-point of each generator on the last day of the week.
    # FIXME: Plot the set-points averaged over the week.
    for i in range(len(case.online_generators)):
        scale_factor = 10
        # plot.setData(i, rday, env._Pg[i, :] * scale_factor)
        plot.setData(i, rday, experiment.Pg[i, :] * scale_factor)

    agent.learn()
    agent.reset()

    # Scale sigma manually.
    sigma = [(sig * 0.95) - 0.05 for sig in sigma]
    learner.explorer.sigma = sigma

    plot.update()

pylab.savefig("/tmp/rlopf.png")
def roundrobin(case, learners, profile, m, nb, ns, mx, weeks, days,
               outdir="/tmp", dc=True, trial=0):
    np = len(profile)
    adj = "dc" if dc else "ac"
    market = SmartMarket(case, priceCap=100.0, decommit=True,
                         locationalAdjustment=adj)

    for i, perms in enumerate(itertools.permutations(learners)):
        experiment = MarketExperiment([], [], market, profile)
        for j, learner in enumerate(perms):
            gens = case.generators[j:j + 1]

            if isinstance(learner, ValueBasedLearner):
                # Comment out for stateful Roth-Erev learner.
                nstates = 1 if isinstance(learner, RothErev) else ns
                env = discrete.MarketEnvironment(gens, market, markups=m,
                                                 numStates=nstates, numOffbids=nb)
                task = discrete.ProfitTask(env, maxSteps=np)
                na = len(env._allActions)
                module = ActionValueTable(numStates=nstates, numActions=na)
            elif isinstance(learner, DirectSearchLearner):
                env = continuous.MarketEnvironment(gens, market, nb, mx)
                task = continuous.ProfitTask(env, maxSteps=np)
                module = buildNetwork(env.outdim, 2, env.indim, bias=True,
                                      outputbias=True, hiddenclass=TanhLayer,
                                      outclass=TanhLayer)
            else:
                raise ValueError

            agent = LearningAgent(module, learner)
            experiment.tasks.append(task)
            experiment.agents.append(agent)

        all_states = zeros((3, 0))
        all_actions = zeros((3, 0))
        all_rewards = zeros((3, 0))

        comments = ["Trial: %d, Perm: %d" % (trial, i)]
        for task, agent in zip(experiment.tasks, experiment.agents):
            g = task.env.generators[0]
            l = agent.learner.__class__.__name__
            comments.append("(%s, %s)" % (g.name, l))
        c = ", ".join(comments)

        for _ in range(weeks):
            experiment.doEpisodes(days)
            states = zeros((0, days * np))
            actions = zeros((0, days * np))
            rewards = zeros((0, days * np))
            for _, agent in enumerate(experiment.agents):
                states = r_[states, agent.history["state"].T]
                actions = r_[actions, agent.history["action"].T]
                rewards = r_[rewards, agent.history["reward"].T]
                agent.learn()
                agent.reset()
            all_states = c_[all_states, states]
            all_actions = c_[all_actions, actions]
            all_rewards = c_[all_rewards, rewards]

        mmwrite(join(outdir, "state_%d_%d.mtx" % (trial, i)), all_states, c)
        mmwrite(join(outdir, "action_%d_%d.mtx" % (trial, i)), all_actions, c)
        mmwrite(join(outdir, "reward_%d_%d.mtx" % (trial, i)), all_rewards, c)
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)

    render = False
    plot = False

    plt.ion()
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"])
    #print "dim: ", task.indim, task.outdim

    # the network takes the state as input and outputs one value per action
    module = ActionValueNetwork(task.outdim, task.indim)
    learner = NFQ()
    # % of random actions
    learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    agent = LearningAgent(module, learner)
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        agent.learn(1)
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True

        if episode % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            #experiment.agent = testagent
            r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            env.delay = False
            testagent.reset()
            #experiment.agent = agent
            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
            # print "reward avg", r
            # print "explorer epsilon", learner.explorer.epsilon
            # print "num episodes", agent.history.getNumSequences()
            # print "update step", len(performance)

    # print "done"
    return performance

#print "network", json.dumps(module.bn.net.E, indent=2)
def rl_optimizer(UImatrix, num_of_actions, top_UI, params, batch_num, logging):
    #global policies
    #global top_UI
    policies = [([])] * params.num_states
    num_states = params.num_states

    # Defining the UI environment
    actionmatrix = range(num_of_actions)
    goal = False  # default goal
    ui_env = UI(num_states, actionmatrix, num_of_actions, goal,
                params.sensor_errors, params.confusion_error, params.penalties,
                params.grid, params.dimensions, params.init_position,
                params.goal_position)
    av_table = ActionValueTable(num_states, num_of_actions, ui_env)
    av_table.initialize(1)

    # Train the agent for each goal
    klm_tot = 0
    klm_avg = 0
    policies = []
    best_actions = []  # [0]*ui_env.num_of_states*(ui_env.num_of_states-1)
    objective = -1
    p_learned = 1
    ii = 0

    #########
    # Define the Q-learning agent
    learner = Q(0.5, 0.99)  #Q(0.6, 0.99)  # 0.5, 0.99
    learner.explorer.epsilon = 0.7  # 0.7 # 0.9
    learner.explorer.decay = 0.999  # 0.99
    learner.explorer.env = ui_env
    agent = LearningAgent(av_table, learner)

    # Define the task and experiment
    task = UITask(ui_env)
    experiment = EpisodicExperiment(task, agent, av_table)

    #######
    # Remove bad actions; give the action matrix as input
    av_table.initialize(-5., actionmatrix)

    for j in range(8):  # learning iterations
        initial_state = 0
        runs = 50  # episodes in one iteration
        experiment.doEpisodes(runs)
        agent.learn()
        agent.reset()

    ##############################################
    # Save the policy (for optimization)
    p = list(av_table.params)  # copies to a new memory slot
    policies.append(p)

    ##############################################
    # Evaluation of the UI and policy for the current goal
    # Loop to get an average: only needed if errors are used
    klm_tasks_tot = np.array([0.] * (params.num_states - 1))
    total_iterations = 1
    klm_tot = 0
    for i in range(total_iterations):
        # KLM value
        klm_g, best_path = evaluation(av_table, ui_env, task, False, params,
                                      batch_num, logging)
        if klm_g == -1:  # not learned
            klm_tot += 20 * 5
            p_learned = 0
            print "error"
            break
        # Add to the total KLM
        klm_tot += klm_g

    klm_avg += klm_tot / total_iterations
    return top_UI, objective, best_actions, best_path, klm_g