# Imports reconstructed for the calls used below (an assumption: the original
# import block is not part of this excerpt).  grabbingEnvironment, grabbingTask
# and grabbingPGPE are project-local modules; note that OptimizationAgent is
# called with an extra `env` argument further down, so a project-patched variant
# of PyBrain's OptimizationAgent may be in use rather than the stock class.
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment

from grabbingEnvironment import grabbingEnvironment
from grabbingTask import grabbingTask
from grabbingPGPE import grabbingPGPE


def run(nao, pad):
    """Train a recurrent network controller for the NAO grabbing task with PGPE,
    print the environment/task/learner settings, and run learn/test episodes
    until the process is interrupted."""
    # ################################
    # choose bottom cam, so nao can see the object when standing next to it
    nao.camera.selectCam(1)
    env = grabbingEnvironment(nao)
    #env.connect(nao)
    task = grabbingTask(env)

    # recurrent controller: observation -> 8 hidden units -> one output per action dimension
    net = buildNetwork(len(task.getObservation()), 8, env.indim, bias=True, recurrent=True)
    print env.indim
    #net = ActionValueNetwork(5, 4)
    #, outclass=TanhLayer)
    #, hiddenclass=TanhLayer, outclass=TanhLayer
    # not correct right now..
    # TODO: train into RL modules, dataset needs to be merged with exploration data
    #generateTraining.generateTraining().runDeltaMovements(nao, net, env, pad)

    # alternative value-based / policy-gradient setups that were tried:
    #module = ActionValueNetwork(3, 3)
    #module = NeuronLayer(40)
    #agent = LearningAgent(net, SARSA())
    #learner = PolicyGradientLearner()
    #learner._setExplorer(StateDependentExplorer(3, 3))
    #learner._setModule(module)
    #agent = LearningAgent(module, learner)
    #agent = LearningAgent(net, ENAC())
    #agent = LearningAgent(net, Reinforce())
    #learner = NFQ()
    #learner.explorer.epsilon = 0.4
    #agent = LearningAgent(net, learner)

    # currently unused: the test episodes below reuse `agent` with optimization disabled
    testagent = OptimizationAgent(net, None, env)
    #agent = LearningAgent(module, Q())
    #agent = LearningAgent(module, QLambda())

    learner = grabbingPGPE(storeAllEvaluations=True, verbose=True, epsilon=1.0,
                           deltamax=5.0, sigmaLearningRate=0.1, learningRate=0.2)
    agent = OptimizationAgent(net, learner, env)
    #agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations=True, verbose=True))
    #agent = OptimizationAgent(net, HillClimber(storeAllEvaluations=True, verbose=True))
    #agent = OptimizationAgent(net, RandomSearch(storeAllEvaluations=True, verbose=True))
    experiment = EpisodicExperiment(task, agent)
    # only for OptimizationAgent
    #experiment.doOptimization = True

    # only for simulator!
    nao.fractionMaxSpeed = 1.0

    print "#env"
    print " sensors:", env.outdim
    print " actions:", env.indim
    print " discreteStates:", env.discreteStates
    print " discreteActions:", env.discreteActions
    print
    print "#task"
    print " sensor_limits:", task.sensor_limits
    print " actor_limits:", task.actor_limits
    print " epiLen:", task.epiLen
    print "#EpisodicTask"
    print " discount:", task.discount
    print " batchSize:", task.batchSize
    print
    # NOTE: these are the grabbingPGPE class defaults, not necessarily the values
    # passed to `learner` above (epsilon, deltamax and the learning rates are overridden there)
    pgpe_defaults = grabbingPGPE()
    print "#PGPE"
    print " exploration type:", pgpe_defaults.exploration
    print " learningRate:", pgpe_defaults.learningRate
    print " sigmaLearningRate:", pgpe_defaults.sigmaLearningRate
    print " epsilon:", pgpe_defaults.epsilon
    print " wDecay:", pgpe_defaults.wDecay
    print " momentum:", pgpe_defaults.momentum
    print " rprop:", pgpe_defaults.rprop

    # commented-out reference setup from PyBrain's cart-pole NFQ example:
    ## switch this to True if you want to see the cart balancing the pole (slower)
    #render = False
    #plt.ion()
    #env = CartPoleEnvironment()
    #if render:
    #    renderer = CartPoleRenderer()
    #    env.setRenderer(renderer)
    #    renderer.start()
    #module = ActionValueNetwork(4, 3)
    #task = DiscreteBalanceTask(env, 100)
    #learner = NFQ()
    #learner.explorer.epsilon = 0.4
    #agent = LearningAgent(module, learner)
    #testagent = LearningAgent(module, None)
    #experiment = EpisodicExperiment(task, agent)
    #performance = []
    #if not render:
    #    pf_fig = plt.figure()

    count = 0
    while True:
        # one learning step after one episode of world-interaction
        count += 1
        print "learning #", count
        experiment.agent = agent
        experiment.doOptimization = True
        erg = experiment.doEpisodes(1)
        print erg
        #experiment.doOptimization = False
        #print "agent learn"
        #agent.learner.learn(1)

        if count > 8:
            # test performance (these real-world experiences are not used for training)
            #if render:
            #    env.delay = True
            #experiment.agent = testagent
            print "testing"
            experiment.doOptimization = False
            erg = experiment.doEpisodes(1)
            summe = 0
            #print erg
            #for x in erg:
            #    summe = sum(x)
            #    print summe
            #r = mean([sum(x) for x in experiment.doEpisodes(5)])
            #env.delay = False
            #testagent.reset()
            #performance.append(r)
            #if not render:
            #    plotPerformance(performance, pf_fig)
            #print "reward avg", r
            #print "explorer epsilon", learner.explorer.epsilon
            #print "num episodes", agent.history.getNumSequences()
            #print "update step", len(performance)

    # earlier manual train/test loop:
    #updates = 0
    #episodes = 10
    #while True:  # for updates in range(5000000)
    #    updates += 1
    #    #raw_input("next episode")
    #    print "learning episode:", updates
    #    experiment.doEpisodes(episodes)
    #    #print "learning finished, starting test run"
    #    env.reset()
    #    #if updates > 0:
    #    #    experiment.doInteractions(20)
    #    #rewsum = 0
    #    #rewlist = []
    #    #for i in range(0, 100):
    #    #    rew = task.performAction(net.activate(task.getObservation()))
    #    #    rewlist.append(rew)
    #    #    rewsum += rew
    #    #    #print " test run:", updates, "action:", i + 1, " reward:", rew
    #    #print "-> sum =", rewsum, " avg:", rewsum / 100.0
    #    #print "episodes:", updates, " rewsum:", rewsum, " testrewards:", rewlist
    #    ##x = "episode:" + updates + " testrewards:" + rewlist
    #    ##o.write(x)
    #    #for i in range(0, len(rewlist)):
    #    #    x = (updates % 20) - 10
    #    #    y = i - 10
    #    #    z = rewlist[i]
    #    #    #g.plot((x, y, z), x=1, y=2, z=3)
    #    ##g.doplot()

    # never reached: the while loop above runs until the process is interrupted
    print "finished grabbingTest"
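# A minimal, hypothetical usage sketch: this module does not show how the `nao`
# wrapper or the `pad` gamepad handle are constructed, so the class name `Nao`
# and its constructor arguments below are assumptions, not part of the original
# code; kept commented out for that reason.
#if __name__ == "__main__":
#    nao = Nao("127.0.0.1", 9559)  # assumed project wrapper around the NAOqi proxies
#    pad = None                    # gamepad only needed for the commented-out pre-training path
#    run(nao, pad)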