import pickle

import numpy
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork


def main():
    # if os.path.exists('./agent.dump'):
    #     with open('./agent.dump') as f:
    #         agent = pickle.load(f)
    # else:
    controller = ActionValueNetwork(9, 4)
    learner = NFQ()
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        score = play(agent)
        score_list.append(score)

        # At this point the Q learner raised:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   (in pybrain/rl/learners/valuebased/q.py)
        # => switching the learner from Q to NFQ fixed it. See:
        # http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        #agent.learn()
        agent.reset()

        #data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]]
        data = [[0, 0, 2], [0, 0, 0], [0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print i, int(numpy.mean(score_list)), max(score_list), move

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump(score_list, f)
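`play` is not defined in this snippet. Below is a minimal sketch of the game loop its usage implies; `Game2048` and its `board`/`move`/`score`/`is_over` members are hypothetical stand-ins, while the `agent` calls are the real PyBrain `LearningAgent` API:

def play(agent):
    game = Game2048(size=3)            # hypothetical 3x3 game, matching the 9-input network
    while not game.is_over():
        agent.integrateObservation(numpy.array(game.board).ravel())
        move = agent.getAction()       # length-1 array holding one of 0..3 (up/down/left/right)
        reward = game.move(int(move))  # assumed to return the points gained by the move
        agent.giveReward(reward)
    return game.score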
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import ENAC


class Team(object):
    def __init__(self, living, task, learner=None):
        # A default of `learner=ENAC()` in the signature would be evaluated once
        # and shared by every Team instance (mutable default argument), so the
        # learner is built here instead.
        if learner is None:
            learner = ENAC()
        self.living = living
        self.task = task
        self.last_reward = 0
        self.agent = LearningAgent(self.living.brain, learner)
        # Copy, so the snapshot does not alias the live parameter array.
        self.oldparams = self.living.brain.params.copy()

    def Interaction(self):
        self.agent.integrateObservation(self.task.getObservation())
        self.task.performAction(self.agent.getAction())
        self.last_reward = self.task.getReward()
        self.agent.giveReward(self.last_reward)
        finished = self.task.isFinished()
        if finished:
            #print task.cumreward
            self.agent.newEpisode()
            self.task.reset()
        return self.last_reward, finished

    def Learn(self, episodes=1):
        self.agent.learn(episodes)
        self.agent.reset()
        newparams = self.living.brain.params.copy()  # get_all_weights(eater.brain)[:]
        # Squared change of the network weights since the last call.
        dif = ((self.oldparams - newparams) ** 2).sum()
        self.oldparams = newparams
        return dif
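A minimal driver for `Team`, assuming hypothetical `living` and `task` objects that provide the interface used above (`living.brain` as a PyBrain module; `getObservation`, `performAction`, `getReward`, `isFinished`, and `reset` on the task):

team = Team(living, task)  # `living` and `task` are assumed to exist
for episode in range(100):
    finished = False
    while not finished:
        reward, finished = team.Interaction()
    print episode, 'reward:', reward, 'weight change:', team.Learn()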
import os
import pickle

import numpy
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork


def main():
    # A table over every 2048 board state is hopeless: with 16 cells and
    # roughly 14 possible values per cell there are about 14**16 states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)
    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)
        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # At this point the Q learner raised:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   (in pybrain/rl/learners/valuebased/q.py)
        # => switching the learner from Q to NFQ fixed it. See:
        # http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print " ", i, int(numpy.mean(score_list)), max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
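The commented-out lines inside the loop hint at resuming from a saved agent. Hoisted above the loop, with the `import os` already at the top, resuming would look like this:

# Resume from a previous run if a dump exists (otherwise keep the fresh agent).
if os.path.exists('./agent.dump'):
    with open('./agent.dump') as f:
        agent = pickle.load(f)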
"""
Program: NFQ_EXAMPLE.PY
Date: Thursday, March 1 2012
Description: Test NFQ on my cartpole simulation.
"""

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork

from cartpole import CartPole
import numpy as np

module = ActionValueNetwork(4, 2)
learner = NFQ()
learner.explorer.epsilon = 0.4
agent = LearningAgent(module, learner)

env = CartPole()
cnt = 0
for i in range(1000):
    env.reset()
    print "Episode: %d, Count: %d" % (i, cnt)
    cnt = 0
    while not env.failure():
        agent.integrateObservation(env.observation())
        action = agent.getAction()
        pstate, paction, reward, state = env.move(action)
        cnt += 1
        agent.giveReward(reward)
    agent.learn(1)
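`CartPole` comes from the author's own `cartpole` module, which is not shown. The sketch below is only a stand-in that satisfies the interface the loop uses (`reset`, `failure`, `observation`, `move`); the dynamics are a crude placeholder, not the original simulation:

import numpy as np

class CartPole(object):
    """Placeholder for the author's simulation; only the interface matters."""
    def __init__(self):
        self.reset()

    def reset(self):
        # state = [x, x_dot, theta, theta_dot]
        self.state = np.random.uniform(-0.05, 0.05, 4)

    def failure(self):
        return abs(self.state[2]) > 0.2 or abs(self.state[0]) > 2.4

    def observation(self):
        return self.state

    def move(self, action):
        pstate = self.state.copy()
        force = 1.0 if int(action) == 1 else -1.0
        # Crude Euler step standing in for the real cart-pole dynamics.
        x, x_dot, theta, theta_dot = self.state
        theta_acc = 9.8 * np.sin(theta) - 0.1 * force
        self.state = np.array([x + 0.02 * x_dot,
                               x_dot + 0.02 * force,
                               theta + 0.02 * theta_dot,
                               theta_dot + 0.02 * theta_acc])
        reward = -1.0 if self.failure() else 0.0
        return pstate, action, reward, self.state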
import random

from pybrain.rl.agents import LearningAgent
from pybrain.rl.explorers import EpsilonGreedyExplorer
from pybrain.rl.learners import Q
from pybrain.rl.learners.valuebased import ActionValueTable

# The parameters of your algorithm
av_table = ActionValueTable(4, 2)
av_table.initialize(0.)

# For the action-value table: alpha=0.5, gamma=0.0 (no future reward in this one-step task)
learner = Q(0.5, 0.0)

# Define the Q-learning agent; epsilon 0.0 disables random exploration
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# The training
for x in xrange(1, 100):
    listxor = random.choice([[0, 0], [0, 1], [1, 0], [1, 1]])
    qstate = listxor[0] + listxor[1] * 2  # encode the two input bits as a state in 0..3
    resultxor = listxor[0] ^ listxor[1]

    agent.integrateObservation([qstate])
    action = agent.getAction()
    if int(action) == resultxor:
        reward = 1
    else:
        reward = -1
    print "xor(", listxor, ") = ", resultxor, " || action = ", action[0], "reward = ", reward
    agent.giveReward(reward)  # 1 for a good answer, -1 for a bad one
    agent.learn()

print "finished"
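Once trained, the table can be inspected directly. A quick check of what was learned for each of the four encoded states; this assumes PyBrain's `ActionValueTable.getActionValues`, which returns the row of Q-values for a state:

for a in (0, 1):
    for b in (0, 1):
        state = a + b * 2
        print "xor(", [a, b], ") q-values:", av_table.getActionValues(state)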
import logging
from threading import Barrier, BrokenBarrierError

from pybrain.rl.agents import LearningAgent
from pybrain.rl.explorers import EpsilonGreedyExplorer
from pybrain.rl.learners import Q

# MyActionValueTable, Simulation and Utils are project-local modules.


class SimulationMaster:
    def __init__(self, n_threads=4, initial_port=19997, q_table_version=0,
                 batch_size=None, learner=None, explorer=None):
        self.barrier = Barrier(n_threads + 1, timeout=720)
        self.n_threads = n_threads
        self.initial_port = initial_port
        self.batch_size = batch_size
        self.controller = MyActionValueTable(q_table_version)
        if learner is None:
            self.learner = Q(0.5, 0.9)
        else:
            self.learner = learner
        if explorer is None:
            self.explorer = self.learner.explorer = EpsilonGreedyExplorer(0.2, 0.998)
        else:
            self.explorer = self.learner.explorer = explorer
        self.agent = LearningAgent(self.controller, self.learner)
        # Logger initialization
        self.logger = logging.getLogger('master_logger')
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(logging.FileHandler(Utils.DATA_PATH + 'learning-tables/master.log'))
        self.failed_simulations = []
        self.n_episodes = 0
        self.simulations = []
        self.initialize_simulations()

    def initialize_simulations(self):
        self.simulations = []
        for i in range(self.n_threads):
            if self.batch_size is not None:
                self.simulations.append(Simulation(self, self.initial_port + i, self.batch_size))
            else:
                self.simulations.append(Simulation(self, self.initial_port + i))

    def get_action(self, observation):
        action = self.controller.activate(observation)
        action = self.explorer.activate(observation, action)
        return action

    def add_observation(self, obs):
        """
        Adds an observation to the agent's memory
        :param obs: 3-element sequence [observation, action, reward]
        """
        self.agent.integrateObservation(obs[0])
        self.agent.lastaction = obs[1]
        self.agent.giveReward(obs[2])

    def update_q_table(self):
        """
        Updates the Q-table with the new simulators' observations
        """
        for sim in self.simulations:
            for trace in sim.traces:
                for obs in trace:
                    self.add_observation(obs)
                self.agent.learn()
                self.agent.reset()
                self.n_episodes += 1
            sim.traces.clear()
        if self.explorer.epsilon > 0.1:
            self.explorer.epsilon *= self.explorer.decay
        if self.learner.alpha > 0.1:
            self.learner.alpha *= 0.999
        self.logger.info('new epsilon: {}'.format(self.explorer.epsilon))
        self.logger.info('new alpha: {}'.format(self.learner.alpha))
        self.logger.info('n episodes: {}'.format(self.n_episodes))

    def save_t_table(self):
        """
        Saves t-tables, one for each thread
        """
        for sim in self.simulations:
            sim.save_t_table()

    def run(self):
        self.controller.initialize(self.agent)
        for sim in self.simulations:
            sim.start()
        counter = 0
        while True:
            try:
                self.barrier.wait()  # wait until all simulations are done
                self.update_q_table()
                self.save_t_table()
                self.barrier.wait()  # free the simulation threads and start a new cycle
                # Counter to avoid saving the Q-table too often
                if counter == 5:
                    self.controller.save()
                    counter = 0
                else:
                    counter += 1
                while self.failed_simulations:
                    sim = self.failed_simulations.pop()
                    self.restart_simulation(sim)
            except BrokenBarrierError:
                self.logger.error('Broken Barrier Error Occurred')
                for sim in self.simulations:
                    sim.stop()
                for sim in self.simulations:
                    sim.join()
                del self.simulations
                self.initialize_simulations()
                self.barrier.reset()
                self.failed_simulations.clear()
                for sim in self.simulations:
                    sim.start()

    def restart_simulation(self, simulation):
        self.logger.info('Restarting simulation with port {}'.format(simulation.port))
        self.simulations.remove(simulation)
        new_simulation = Simulation(self, simulation.port)
        self.simulations.append(new_simulation)
        new_simulation.start()
        del simulation
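`MyActionValueTable`, `Simulation`, and `Utils` are project-local and not shown. The rough sketch below is inferred purely from the calls the master makes (`start`, `stop`, `join`, `traces`, `save_t_table`, `port`, and the two barrier waits per cycle); it is not the real worker:

from threading import Thread

class Simulation(Thread):
    """Inferred interface only: runs episodes against a simulator on `port`
    and hands (observation, action, reward) traces back to the master."""
    def __init__(self, master, port, batch_size=1):
        super().__init__()
        self.master = master
        self.port = port
        self.batch_size = batch_size
        self.traces = []  # one list per episode, each holding [obs, action, reward] triples
        self._stopped = False

    def run(self):
        while not self._stopped:
            # ... run self.batch_size episodes here, appending triples to
            # self.traces and picking actions via self.master.get_action(obs) ...
            self.master.barrier.wait()  # episodes done: let the master learn
            self.master.barrier.wait()  # wait until the master frees us again

    def stop(self):
        self._stopped = True

    def save_t_table(self):
        pass  # persistence is left to the real implementation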