def getAction(self):
    """Return the next action to execute.

    If no supervisor suggestion is pending, act independently via the
    Boltzmann / Q-Learning policy inherited from LearningAgent.  If a
    suggestion (``self.nextAction``) is pending, either follow it
    unconditionally (no tolerance configured) or follow it only while the
    expected reward, inflated by the tolerance margin, still exceeds the
    learned value of the suggested action.

    Side effects: records the chosen action in ``self.lastaction`` and
    clears ``self.nextAction`` whenever the suggestion is consumed.
    """
    # NOTE: original used `== None` / `!= None`; PEP 8 mandates `is None`.
    if self.nextAction is None:
        # No supervisor suggestion: act independently.
        action = LearningAgent.getAction(self)
        self.lastaction = action
        return action

    if self.tolerance is not None:
        # Suggestion gated by tolerance: accept only while the inflated
        # expected reward beats the learned value of the suggested action.
        if (self.expectedReward * (1 + self.tolerance)) > self.module.getActionValue(self.nextAction):
            action = self.nextAction
            self.nextAction = None
        else:
            # Suggestion rejected: fall back to the independent policy
            # (suggestion stays pending, exactly as in the original).
            action = LearningAgent.getAction(self)
        self.lastaction = action
        return action

    # Suggestion without tolerance: always follow it.
    action = self.nextAction
    self.lastaction = action
    self.nextAction = None
    return action
def main(): # if os.path.exists('./agent.dump'): # with open('./agent.dump') as f: # agent = pickle.load(f) # else: controller = ActionValueNetwork(9, 4) learner = NFQ() agent = LearningAgent(controller, learner) score_list = [] for i in range(10000): score = play(agent) score_list.append(score) # ここで, # TypeError: only length-1 arrays can be converted to Python scalars # pybrain/rl/learners/valuebased/q.py # => learnerをQからNFQにしたら行けた. # => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work #agent.learn() agent.reset() #data =[[0,0,0,0], [0,0,0,0], [0,0,0,2], [0,0,0,2]] data =[[0,0,2], [0,0,0], [0,0,2]] agent.integrateObservation(numpy.array(data).ravel()) move = agent.getAction() print i, int(numpy.mean(score_list)) , max(score_list), move with open('./agent.dump', 'w') as f: pickle.dump(agent, f) with open('./score.dump', 'w') as f: pickle.dump(score_list, f)
class Team(object):
    """Couples a living (an individual exposing a ``.brain`` network) to a
    task, driving the observe/act/reward loop and policy-gradient learning.
    """

    def __init__(self, living, task, learner=None):
        """Build the team.

        NOTE(review): the original signature was ``learner=ENAC()`` -- a
        default evaluated once at class-definition time, so every Team
        constructed without an explicit learner SHARED one ENAC instance.
        A fresh learner is now created per instance; passing a learner
        explicitly behaves exactly as before.
        """
        if learner is None:
            learner = ENAC()
        self.living = living
        self.task = task
        self.last_reward = 0
        self.agent = LearningAgent(self.living.brain, learner)
        # Copy the snapshot: the original stored a reference to the live
        # parameter array, so the first Learn() compared the array with
        # itself and reported a spurious diff.  Learn() already copies;
        # the constructor must too.
        self.oldparams = self.living.brain.params.copy()

    def Interaction(self):
        """Run one observation/action/reward step.

        Returns ``(last_reward, finished)``; on episode end, starts a new
        agent episode and resets the task.
        """
        self.agent.integrateObservation(self.task.getObservation())
        self.task.performAction(self.agent.getAction())
        self.last_reward = self.task.getReward()
        self.agent.giveReward(self.last_reward)
        finished = self.task.isFinished()
        if finished:
            self.agent.newEpisode()
            self.task.reset()
        return self.last_reward, finished

    def Learn(self, episodes=1):
        """Train on the stored episodes and return the squared Euclidean
        distance the brain's parameters moved since the last snapshot."""
        self.agent.learn(episodes)
        self.agent.reset()
        newparams = self.living.brain.params.copy()
        # zip replaces the original manual index-counter loop.
        dif = sum((old - new) ** 2 for old, new in zip(self.oldparams, newparams))
        self.oldparams = newparams
        return dif
def main(): # 2048の全ての状態を保存するのは無理でしょ. # 14^16通りの状態があるよね. #controller = ActionValueTable(16, 4) #learner = Q() #controller.initialize(1.) controller = ActionValueNetwork(16, 4) learner = NFQ() #learner._setExplorer(EpsilonGreedyExplorer(0.0)) agent = LearningAgent(controller, learner) score_list = [] for i in range(10000): # if os.path.exists('./agent.dump'): # with open('./agent.dump') as f: # agent = pickle.load(f) print i, 'playing ...' score = play(agent) score_list.append(score) # ここで, # TypeError: only length-1 arrays can be converted to Python scalars # pybrain/rl/learners/valuebased/q.py # => learnerをQからNFQにしたら行けた. # => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work print i, 'learning ...' agent.learn() agent.reset() print i, 'evaluate sample ...' data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]] agent.integrateObservation(numpy.array(data).ravel()) move = agent.getAction() print " ", i, int( numpy.mean(score_list)), max(score_list), move if i % 20 == 0: print i, 'saving ...' with open('./agent.dump', 'w') as f: pickle.dump(agent, f) with open('./score.dump', 'w') as f: pickle.dump(score_list, f)
def main(): # 2048の全ての状態を保存するのは無理でしょ. # 14^16通りの状態があるよね. #controller = ActionValueTable(16, 4) #learner = Q() #controller.initialize(1.) controller = ActionValueNetwork(16, 4) learner = NFQ() #learner._setExplorer(EpsilonGreedyExplorer(0.0)) agent = LearningAgent(controller, learner) score_list = [] for i in range(10000): # if os.path.exists('./agent.dump'): # with open('./agent.dump') as f: # agent = pickle.load(f) print i, 'playing ...' score = play(agent) score_list.append(score) # ここで, # TypeError: only length-1 arrays can be converted to Python scalars # pybrain/rl/learners/valuebased/q.py # => learnerをQからNFQにしたら行けた. # => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work print i, 'learning ...' agent.learn() agent.reset() print i, 'evaluate sample ...' data =[[0,0,0,0], [0,0,0,0], [0,0,0,2], [0,0,0,2]] agent.integrateObservation(numpy.array(data).ravel()) move = agent.getAction() print " ",i, int(numpy.mean(score_list)) , max(score_list), move if i % 20 == 0: print i, 'saving ...' with open('./agent.dump', 'w') as f: pickle.dump(agent, f) with open('./score.dump', 'w') as f: pickle.dump(score_list, f)
Program: NFQ_EXAMPLE.PY Date: Thursday, March 1 2012 Description: Test NFQ on my cartpole simulation. """ from pybrain.rl.agents import LearningAgent from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork from cartpole import CartPole import numpy as np module = ActionValueNetwork(4,2) learner = NFQ() learner.explorer.epsilon = 0.4 agent = LearningAgent(module, learner) env = CartPole() cnt = 0 for i in range(1000): env.reset() print "Episode: %d, Count: %d" % (i,cnt) cnt = 0 while not env.failure(): agent.integrateObservation(env.observation()) action = agent.getAction() pstate, paction, reward, state = env.move(action) cnt += 1 agent.giveReward(reward) agent.learn(1)