# Without the next line, the pyplot plot won't actually show up.
plt.pause(0.001)

performance = []

if not render:
    pf_fig = plt.figure()

while True:
    # One learning step after one episode of world-interaction.
    experiment.doEpisodes(1)
    agent.learn(1)

    # Test performance (these real-world experiences are not used for training).
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)

    print("reward avg", r)
    print("explorer epsilon", learner.explorer.epsilon)
    print("num episodes", agent.history.getNumSequences())
    print("update step", len(performance))
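# For context: the loop above assumes the standard PyBrain cart-pole/NFQ
# setup below -- a minimal sketch, assuming stock PyBrain (the concrete
# values match the ones this loop prints and uses):
from numpy import mean
import matplotlib.pyplot as plt
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment

def plotPerformance(values, fig):
    # Redraw the running performance curve in the existing figure.
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()

render = False
plt.ion()
env = CartPoleEnvironment()
module = ActionValueNetwork(4, 3)      # 4 state dims, 3 discrete actions
task = DiscreteBalanceTask(env, 100)   # episodes capped at 100 steps
learner = NFQ()
learner.explorer.epsilon = 0.4
agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)  # no learner: evaluation only
experiment = EpisodicExperiment(task, agent)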
perform_cumrewards = []
for irehearsal in range(7000):

    # Learn.
    # ------
    r = exp.doEpisodes(1)

    # Discounted reward.
    cumreward = exp.task.getTotalReward()
    # print('cumreward: %.4f; nsteps: %i; learningRate: %.4f'
    #       % (cumreward, len(r[0]), exp.agent.learner.learningRate))

    if irehearsal % 50 == 0:
        # Perform (no learning).
        # ----------------------
        # Swap out the agent.
        exp.agent = performance_agent

        # Perform.
        r = exp.doEpisodes(1)
        perform_cumreward = task.getTotalReward()
        perform_cumrewards.append(perform_cumreward)
        print('PERFORMANCE: cumreward:', perform_cumreward,
              'nsteps:', len(r[0]))

        # Swap back the learning agent.
        performance_agent.reset()
        exp.agent = agent

        ax1.cla()
        ax1.plot(perform_cumrewards, '.--')

        # Wheel trajectories.
        update_wheel_trajectories()
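# The rehearsal loop above assumes a setup along these lines (hypothetical
# names, following PyBrain's bicycle-balancing example): `exp` is an
# EpisodicExperiment on the balance task, `agent` is the learning agent,
# `performance_agent` is a copy used purely for evaluation, and `ax1`/`ax2`
# are matplotlib axes -- a minimal sketch of the plotting side only:
import matplotlib.pyplot as plt

plt.ion()
fig = plt.figure(figsize=(8, 4))
ax1 = fig.add_subplot(1, 2, 1)  # cumulative reward per evaluation episode
ax2 = fig.add_subplot(1, 2, 2)  # front/back wheel trajectories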
def run(nao, pad):
    # ################################
    # Choose the bottom cam, so the Nao can see the object when standing
    # next to it.
    nao.camera.selectCam(1)

    env = grabbingEnvironment(nao)
    # env.connect(nao)
    task = grabbingTask(env)
    net = buildNetwork(len(task.getObservation()), 8, env.indim,
                       bias=True, recurrent=True)
    print(env.indim)

    # net = ActionValueNetwork(5, 4)  # hiddenclass/outclass=TanhLayer -- not correct right now.
    # TODO: train into RL modules; the dataset needs to be merged with the
    # exploration data.
    # generateTraining.generateTraining().runDeltaMovements(nao, net, env, pad)

    # Alternative learner setups that were tried, kept for reference:
    # module = ActionValueNetwork(3, 3)
    # module = NeuronLayer(40)
    # agent = LearningAgent(net, SARSA())
    # learner = PolicyGradientLearner()
    # learner._setExplorer(StateDependentExplorer(3, 3))
    # learner._setModule(module)
    # agent = LearningAgent(module, learner)
    # agent = LearningAgent(net, ENAC())
    # agent = LearningAgent(net, Reinforce())
    # learner = NFQ()
    # learner.explorer.epsilon = 0.4
    # agent = LearningAgent(net, learner)
    # agent = LearningAgent(module, Q())
    # agent = LearningAgent(module, QLambda())

    testagent = OptimizationAgent(net, None, env)
    learner = grabbingPGPE(storeAllEvaluations=True, verbose=True,
                           epsilon=1.0, deltamax=5.0,
                           sigmaLearningRate=0.1, learningRate=0.2)
    agent = OptimizationAgent(net, learner, env)
    # agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations=True, verbose=True))
    # agent = OptimizationAgent(net, HillClimber(storeAllEvaluations=True, verbose=True))
    # agent = OptimizationAgent(net, RandomSearch(storeAllEvaluations=True, verbose=True))
    experiment = EpisodicExperiment(task, agent)
    # Only needed for an OptimizationAgent:
    # experiment.doOptimization = True

    # Only for the simulator!
    nao.fractionMaxSpeed = 1.0

    print("#env")
    print(" sensors:", env.outdim)
    print(" actions:", env.indim)
    print(" discreteStates:", env.discreteStates)
    print(" discreteActions:", env.discreteActions)
    print()
    print("#task")
    print(" sensor_limits:", task.sensor_limits)
    print(" actor_limits:", task.actor_limits)
    print(" epilen:", task.epiLen)
    print("#EpisodicTask")
    print(" discount:", task.discount)
    print(" batchsize:", task.batchSize)
    print()
    print("#PGPE")
    print(" exploration type:", grabbingPGPE().exploration)
    print(" LearningRate:", grabbingPGPE().learningRate)
    print(" sigmaLearningRate:", grabbingPGPE().sigmaLearningRate)
    print(" epsilon:", grabbingPGPE().epsilon)
    print(" wDecay:", grabbingPGPE().wDecay)
    print(" momentum:", grabbingPGPE().momentum)
    print(" rprop:", grabbingPGPE().rprop)

    count = 0
    while True:
        # One learning step after one episode of world-interaction.
        count += 1
        print("learning #", count)
        experiment.agent = agent
        experiment.doOptimization = True
        erg = experiment.doEpisodes(1)
        print(erg)
        # experiment.doOptimization = False
        # print("agent learn")
        # agent.learner.learn(1)

        if count > 8:
            # Test performance (these real-world experiences are not used
            # for training).
            # if render:
            #     env.delay = True
            # experiment.agent = testagent
            print("testing")
            experiment.doOptimization = False
            erg = experiment.doEpisodes(1)
            summe = 0
            # print(erg)
            # for x in erg:
            #     summe = sum(x)
            #     print(summe)
            # r = mean([sum(x) for x in experiment.doEpisodes(5)])
            # env.delay = False
            # testagent.reset()
            # performance.append(r)
            # if not render:
            #     plotPerformance(performance, pf_fig)
            # print("reward avg", r)
            # print("explorer epsilon", learner.explorer.epsilon)
            # print("num episodes", agent.history.getNumSequences())
            # print("update step", len(performance))

    # Older training loop, kept for reference:
    # updates = 0
    # episodes = 10
    # while True:
    #     updates += 1
    #     # raw_input("next episode")
    #     print("learning episode:", updates)
    #     experiment.doEpisodes(episodes)
    #     # print("learning finished, starting test run")
    #     env.reset()
    #     # if updates > 0:
    #     #     experiment.doInteractions(20)
    #     # rewsum = 0
    #     # rewlist = []
    #     # for i in range(0, 100):
    #     #     rew = task.performAction(net.activate(task.getObservation()))
    #     #     rewlist.append(rew)
    #     #     rewsum += rew
    #     #     # print(" test run:", updates, "action:", i + 1, "reward:", rew)
    #     # print("-> sum =", rewsum, " avg:", rewsum / 100.0)
    #     # print("episodes:", updates, " rewsum:", rewsum, " testrewards:", rewlist)
    #     # for i in range(0, len(rewlist)):
    #     #     x = (updates % 20) - 10
    #     #     y = i - 10
    #     #     z = rewlist[i]
    #     #     # g.plot((x, y, z), x=1, y=2, z=3)
    #     # g.doplot()

    print("finished grabbingTest")
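# For contrast with the robot-specific code above, the same
# OptimizationAgent pattern on a standard PyBrain task -- a minimal sketch,
# assuming stock PyBrain with its plain PGPE (parameter values are
# illustrative, not tuned):
from pybrain.tools.shortcuts import buildNetwork
from pybrain.optimization import PGPE
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask

env = CartPoleEnvironment()
task = BalanceTask(env, maxsteps=200)
net = buildNetwork(task.outdim, 8, task.indim, bias=True)
# EpisodicExperiment detects the OptimizationAgent and drives the optimizer
# itself; each doEpisodes() call then performs one evaluation episode.
agent = OptimizationAgent(net, PGPE(storeAllEvaluations=True))
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(100)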
def update_wheel_trajectories():
    front_lines = ax2.plot(task.env.get_xfhist(), task.env.get_yfhist(), 'r')
    back_lines = ax2.plot(task.env.get_xbhist(), task.env.get_ybhist(), 'b')
    plt.axis('equal')


perform_cumrewards = []
for iteration in range(100000):
    # print("ITERATION :", iteration)
    r = exp.doEpisodes(1)
    cumreward = exp.task.getTotalReward()
    # print('cumreward: %.4f; nsteps: %i; learningRate: %.4f'
    #       % (cumreward, len(r[0]), exp.agent.learner.learningRate))

    if iteration % 15 == 0:
        # Evaluate with the performance agent (no learning).
        exp.agent = max_agent
        r = exp.doEpisodes(1)
        perform_cumreward = task.getTotalReward()
        perform_cumrewards.append(perform_cumreward)
        print('PERFORMANCE: cumreward:', perform_cumreward,
              'nsteps:', len(r[0]))

        stats = task.env.get_yfhist()
        new_stats = [np.max(stats), np.mean(stats), np.median(stats),
                     stats[-1], perform_cumreward, iteration,
                     exp.agent.learner.learningRate]
        new_stats = [str(s) for s in new_stats]
        with open("res/lspi_30-50.txt", "a") as myfile:
            myfile.write(" ".join(new_stats) + "\n")

        # Swap the learning agent back in, as in the rehearsal loop above;
        # without this, every subsequent episode would run the evaluation agent.
        max_agent.reset()
        exp.agent = agent
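# Companion sketch (not part of the original script): load the stats file
# written above and plot the evaluation curve. The column order follows the
# `new_stats` list: max, mean, median, last y_front, perform_cumreward,
# iteration, learningRate.
import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt("res/lspi_30-50.txt")
plt.plot(data[:, 5], data[:, 4], ".--")  # iteration vs. evaluation cumreward
plt.xlabel("iteration")
plt.ylabel("PERFORMANCE cumreward")
plt.show()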