from collections import defaultdict, deque
from time import clock


def qLearning(world, userMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)
    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)
    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    allStates = getAllStates(domain, rf, tf, initialState)
    NUM_INTERVALS = MAX_ITERATIONS
    iterations = range(1, MAX_ITERATIONS + 1)
    qInit = 0
    for lr in [0.01, 0.1, 0.5]:
        for epsilon in [0.3, 0.5, 0.7]:
            # seed the moving-average window high so early averages read as unconverged
            last10Chg = deque([10] * 10, maxlen=10)
            Qname = 'Q-Learning L{:0.2f} E{:0.1f}'.format(lr, epsilon)
            # agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)
            agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon)
            agent.setDebugCode(0)
            print("*** {}: {}".format(world, Qname))
            for nIter in iterations:
                if nIter % 200 == 0:
                    print('Iteration: {}'.format(nIter))
                startTime = clock()
                # ea = agent.runLearningEpisode(env, 300)
                ea = agent.runLearningEpisode(env)
                env.resetEnvironment()
                agent.initializeForPlanning(rf, tf, 1)
                p = agent.planFromState(initialState)  # run planning from our initial state
                endTime = clock()
                timing[Qname].append((endTime - startTime) * 1000)
                last10Chg.append(agent.maxQChangeInLastEpisode)
                convergence[Qname].append(sum(last10Chg) / 10.)
                # evaluate the policy with one rollout and visualize the trajectory
                runEvals(initialState, p, rewards[Qname], steps[Qname], rf, tf, evalTrials=1)
                if nIter % 1000 == 0:
                    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                                  '{} {} Iter {} Policy Map.pkl'.format(world, Qname, nIter))
            simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname)
            dumpCSV(nIter, timing[Qname], rewards[Qname], steps[Qname],
                    convergence[Qname], world, Qname)
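
# The helpers used above (runEvals, dumpCSV, dumpPolicyMap, getAllStates) are
# defined elsewhere in the repo. A minimal sketch of what runEvals is assumed
# to do -- roll the greedy policy out from the initial state and record mean
# reward and episode length -- using BURLAP's Policy.evaluateBehavior; the
# exact signature and return fields here are assumptions, not the repo's code.
def runEvals(initialState, policy, rewardL, stepL, rf=None, tf=None, evalTrials=1):
    totalReward, totalSteps = 0.0, 0.0
    for _ in range(evalTrials):
        ea = policy.evaluateBehavior(initialState, rf, tf)  # one greedy rollout
        totalReward += sum(ea.rewardSequence)
        totalSteps += ea.numTimeSteps()
    rewardL.append(totalReward / evalTrials)  # mean total reward per rollout
    stepL.append(totalSteps / evalTrials)     # mean steps to termination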
def pIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)
    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)
    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    policy_converged = defaultdict(list)
    last_policy = []  # previous iteration's policy, for convergence checks
    allStates = getAllStates(domain, rf, tf, initialState)
    print("*** {} Policy Iteration Analysis".format(world))
    iterations = range(1, MAX_ITERATIONS + 1)
    # maxDelta=-1 disables the delta cutoff; one evaluation and one improvement
    # step per call, so each loop pass performs one policy-iteration step
    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, 1)
    pi.setDebugCode(0)
    for nIter in iterations:
        startTime = clock()
        # pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, nIter)
        # pi.setDebugCode(0)
        # run planning from our initial state
        p = pi.planFromState(initialState)
        endTime = clock()
        timing['Policy'].append((endTime - startTime) * 1000)
        convergence['Policy'].append(pi.lastPIDelta)
        # evaluate the policy with one rollout and visualize the trajectory
        runEvals(initialState, p, rewards['Policy'], steps['Policy'], rf, tf, evalTrials=1)
        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                                   "Policy Iteration {}".format(nIter))
        policy = pi.getComputedPolicy()
        allStates = pi.getAllStates()
        current_policy = [[(action.ga, action.pSelection)
                           for action in policy.getActionDistributionForState(state)]
                          for state in allStates]
        policy_converged['Policy'].append(current_policy == last_policy)
        last_policy = current_policy
    simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                           "Policy Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                  world + ' Policy Iteration Policy Map.pkl')
    dumpCSVp(iterations, timing['Policy'], rewards['Policy'], steps['Policy'],
             convergence['Policy'], world, 'Policy', policy_converged['Policy'])
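
# A minimal sketch of what dumpPolicyMap is assumed to do: pickle the structure
# returned by MapPrinter.printPolicyMap to the given filename. The repo's real
# implementation may differ.
import pickle

def dumpPolicyMap(policyMap, fname):
    with open(fname, 'wb') as f:
        pickle.dump(policyMap, f)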
def vIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)
    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)
    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    allStates = getAllStates(domain, rf, tf, initialState)
    print("*** {} Value Iteration Analysis".format(world))
    iterations = range(1, MAX_ITERATIONS + 1)
    # maxDelta=-1 disables the delta cutoff; each runVI() call performs one sweep
    vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1, 1)
    vi.setDebugCode(0)
    vi.performReachabilityFrom(initialState)
    vi.toggleUseCachedTransitionDynamics(False)
    timing['Value'].append(0)  # sentinel so the list can be sliced with [1:] below
    for nIter in iterations:
        startTime = clock()
        vi.runVI()
        p = vi.planFromState(initialState)
        endTime = clock()
        timing['Value'].append((endTime - startTime) * 1000)
        convergence['Value'].append(vi.latestDelta)
        # evaluate the policy with evalTrials rollouts
        runEvals(initialState, p, rewards['Value'], steps['Value'], rf, tf, evalTrials=1)
        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   "Value Iteration {}".format(nIter))
    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                           "Value Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                  world + ' Value Iteration Policy Map.pkl')
    dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'],
            convergence['Value'], world, 'Value')
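
# A minimal sketch of dumpCSV (hypothetical; the repo's version may differ):
# write one row per iteration to '<world> <method>.csv'. Column names are
# illustrative. Some call sites pass an iteration count, others the iterations
# list, so both are accepted.
import csv

def dumpCSV(nIter, times, rewards, steps, convergence, world, method, discount=None):
    iters = nIter if hasattr(nIter, '__iter__') else range(1, nIter + 1)
    fname = '{} {}.csv'.format(world, method)
    with open(fname, 'wb') as f:  # 'wb' for the Python 2 / Jython csv module
        w = csv.writer(f)
        w.writerow(['iter', 'time_ms', 'reward', 'steps', 'convergence'])
        for row in zip(iters, times, rewards, steps, convergence):
            w.writerow(row)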
flag = True
timing['Value'].append(0)
for nIter in iterations:
    startTime = clock()
    vi.runVI()
    # timing['Value'].append((clock() - startTime) * 1000)
    # record cumulative wall-clock time instead of per-iteration time
    timing['Value'].append(timing['Value'][-1] + clock() - startTime)
    p = vi.planFromState(initialState)
    convergence['Value'].append(vi.latestDelta)
    # evaluate the policy with evalTrials rollouts
    runEvals(initialState, p, rewards['Value'], steps['Value'])
    # if nIter == 1:
    #     simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
    #                            "Value Iteration {}".format(nIter))
    if vi.latestDelta < 1e-6 and flag:
        flag = False  # visualize and dump only at the first converged iteration
        simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                               "Value Iteration {}".format(nIter))
        dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                      'Value {} Iter {} Policy Map.pkl'.format(world, nIter))
    # if vi.latestDelta < 1e-6:
    #     break
print("\n\n\n")
simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                       "Value Iteration {}".format(nIter))
dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'],
        convergence['Value'], world, 'Value')
pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-3, 10, 1)
pi.toggleUseCachedTransitionDynamics(False)
print("//{} Policy Iteration Analysis//".format(world))
flag = True
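
# The loop above accumulates cumulative seconds rather than per-iteration
# durations. A small helper (hypothetical, not in the repo) to recover
# per-iteration times from such a cumulative series:
def perIterationTimes(cumulative):
    # cumulative[0] is the 0 sentinel appended before the loop
    return [later - earlier for earlier, later in zip(cumulative, cumulative[1:])]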
Qname = 'Q-Learning L{} q{} E{}'.format(lr, qInit, epsilon)  # format string reconstructed; the head of this line was truncated in the fragment
agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)
# agent.setLearningRateFunction(SoftTimeInverseDecayLR(1., 0.))
agent.setDebugCode(0)
print("//{} {} Iteration Analysis//".format(world, Qname))
for nIter in iterations:
    if nIter % 50 == 0:
        print(nIter)
    startTime = clock()
    ea = agent.runLearningEpisode(env, 300)  # cap each episode at 300 steps
    # accumulate cumulative wall-clock time across episodes
    if len(timing[Qname]) > 0:
        timing[Qname].append(timing[Qname][-1] + clock() - startTime)
    else:
        timing[Qname].append(clock() - startTime)
    env.resetEnvironment()
    agent.initializeForPlanning(rf, tf, 1)
    p = agent.planFromState(initialState)  # run planning from our initial state
    last10Chg.append(agent.maxQChangeInLastEpisode)
    convergence[Qname].append(sum(last10Chg) / 10.)
    # evaluate the policy with one rollout and visualize the trajectory
    runEvals(initialState, p, rewards[Qname], steps[Qname])
    if nIter in (9, 100, 1066, 2900):
        simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory,
                               "Q-learning Iteration {}".format(nIter))
        raw_input('Press enter to continue')
print("Finished")
    ea = agent.runLearningEpisode(env)
    env.resetEnvironment()
    agent.initializeForPlanning(rf, tf, 1)
    p = agent.planFromState(initialState)  # run planning from our initial state
    timing[Qname].append((clock() - startTime) * 1000)
    # despite its name, last10Rewards tracks the max Q-value change per episode
    last10Rewards.append(agent.maxQChangeInLastEpisode)
    convergence[Qname].append(sum(last10Rewards) / 10.)
    # evaluate the policy with one rollout and visualize the trajectory
    runEvals(initialState, p, rewards[Qname], steps[Qname])
    # if lr == 0.9 and epsilon == 0.5 and nIter == 1:
    #     simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname + " Iter: 1")
    # Uncomment to visualize the environment after the first iteration:
    # if nIter == 1:
    #     simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname + " {}".format(nIter))
    #     break
MapPrinter.printPolicyMap(getAllStates(domain, rf, tf, initialState), p, gen.getMap())
print("\n\n\n")
simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory,
                       Qname + " {}".format(nIter))
dumpCSV(iterations, timing[Qname], rewards[Qname], steps[Qname],
        convergence[Qname], world, Qname)
print('done')
# if lr == 0.9 and epsilon == 0.3:
#     simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname + ' {}'.format(nIter))
#     input('s')
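
# The convergence metric above is a 10-episode moving average of
# maxQChangeInLastEpisode, maintained with a bounded deque. A standalone
# illustration of the mechanism (the values are made up):
from collections import deque

last10 = deque([10] * 10, maxlen=10)  # seeded high so early averages read as unconverged
last10.append(0.5)                    # appending evicts the oldest entry automatically
movingAvg = sum(last10) / 10.         # -> (9 * 10 + 0.5) / 10 = 9.05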
        # head of this dict comprehension was truncated in the fragment; per the
        # parallel fragment below, it builds {state: greedy action name}:
        current_policy = {
            state: policy.getAction(state).toString()
            for state in allStates
        }
        if nIter == 1:
            convergence['Policy'].append(18)  # sentinel: no previous policy to compare
        else:
            convergence['Policy'].append(comparePolicies(last_policy, current_policy))
            print('convergence policy = ' + str(comparePolicies(last_policy, current_policy)))
        last_policy = current_policy
        # evaluate the policy with evalTrials rollouts
        runEvals(initialState, p, rewards['Value'], steps['Value'])
        if nIter == 1:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   "Value Iter {} Disc {}".format(nIter, discount))
            dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                          'Value {} Iter {} Disc {} Policy Map.pkl'.format(world, nIter, str(discount)))
        if nIter % 2 == 1:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   "Value Iter {} Disc {}".format(nIter, discount))
            dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                          'Value {} Iter {} Disc {} Policy Map.pkl'.format(world, nIter, str(discount)))
        if nIter == 5 or vi.latestDelta < 1e-6:
            dumpPolicyMap(  # call truncated in the original fragment
    timing['Policy'].append(timing['Policy'][-1] + clock() - startTime)
    policy = pi.getComputedPolicy()
    current_policy = {
        state: policy.getAction(state).toString()
        for state in allStates
    }
    convergence['Policy2'].append(pi.lastPIDelta)
    if nIter == 1:
        convergence['Policy'].append(999)  # sentinel: no previous policy to compare
    else:
        convergence['Policy'].append(comparePolicies(last_policy, current_policy))
    last_policy = current_policy
    runEvals(initialState, p, rewards['Policy'], steps['Policy'])
    if nIter == 5 or convergence['Policy2'][-1] < 1e-6:
        simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                               "Policy Iteration {}".format(nIter))
        dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                      'Policy {} Iter {} Policy Map.pkl'.format(world, nIter))
    if convergence['Policy2'][-1] < 1e-6:
        break
MapPrinter.printPolicyMap(pi.getAllStates(), p, gen.getMap())
print("\n\n\n")
dumpCSV(nIter, timing['Policy'][1:], rewards['Policy'], steps['Policy'],
        convergence['Policy2'], world, 'Policy')
# raise
MAX_ITERATIONS = NUM_INTERVALS = MAX_ITERATIONS * 10
increment = MAX_ITERATIONS / NUM_INTERVALS
iterations = range(1, MAX_ITERATIONS + 1)
for qInit in [-100, 0, 100]:
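
# A minimal sketch of comparePolicies (hypothetical; the repo's version may
# differ), assuming both arguments are {state: actionName} dicts over the same
# states. It returns the number of states whose greedy action changed, so a
# value of 0 means the policy has stabilized.
def comparePolicies(oldPolicy, newPolicy):
    return sum(1 for state, action in newPolicy.items()
               if oldPolicy.get(state) != action)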
print "//Hard Value Iteration Analysis//" for nIter in iterations: startTime = clock() vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1, nIter) #//Added a very high delta number in order to guarantee that value iteration occurs the max number of iterations for comparison with the other algorithms. # run planning from our initial state vi.setDebugCode(0) p = vi.planFromState(initialState) timing['Value'].append(clock() - startTime) convergence['Value'].append(vi.latestDelta) # evaluate the policy with evalTrials roll outs runEvals(initialState, p, rewards['Value'], steps['Value']) if nIter == 2 or nIter == 20 or nIter == 100: simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, 'Value Iteration %s' % (nIter)) MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap()) print "\n\n\n" # simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, 'Value Iteration %s' % (nIter)) # input('c') dumpCSV(iterations, timing['Value'], rewards['Value'], steps['Value'], convergence['Value'], world, 'Value', discount=discount) print "//Hard Policy Iteration Analysis//"
    startTime = clock()
    vi.runVI()
    timing['Value'].append(timing['Value'][-1] + clock() - startTime)
    p = vi.planFromState(initialState)
    convergence['Value'].append(vi.latestDelta)
    # evaluate the policy with evalTrials rollouts
    runEvals(initialState, p, rewards['Value'], steps['Value'])
    if nIter == 5 or vi.latestDelta < 1e-6:
        # simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
        #                        "Value Iteration {}".format(nIter))
        dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                      'Hard/VI/Value {} Iter {} Policy Map.pkl'.format(world, nIter))
    if nIter == 100:
        simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                               "Value Iteration {}".format(nIter))
        dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                      'Hard/VI/Value {} Iter {} Policy Map.pkl'.format(world, nIter))
    # if vi.latestDelta < 1e-6:
    #     break
print("\n\n\n")
dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'],
        convergence['Value'], world, 'Value')
time.sleep(20)
pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-3, 10, 1)
pi.toggleUseCachedTransitionDynamics(False)
print("//{} Policy Iteration Analysis//".format(world))
print "//Easy Value Iteration Analysis//" for nIter in iterations: startTime = clock() vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1, nIter); # //Added a very high delta number in order to guarantee that value iteration occurs the max number of iterations for comparison with the other algorithms. # run planning from our initial state vi.setDebugCode(0) p = vi.planFromState(initialState); timing['Value'].append(clock() - startTime) convergence['Value'].append(vi.latestDelta) # evaluate the policy with evalTrials roll outs runEvals(initialState, p, rewards['Value'], steps['Value']) if nIter == 1 or nIter == 25 or nIter == 50 or nIter == 15 or nIter == 100 : simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, 'Value Iteration %s' % (nIter)) MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap()); print "\n\n\n" # simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, 'Value Iteration %s' % (nIter)) dumpCSV(iterations, timing['Value'], rewards['Value'], steps['Value'], convergence['Value'], world, 'Value', discount=discount) # print "//Easy Policy Iteration Analysis//" for nIter in iterations: startTime = clock() pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, nIter); # //Added a very high delta number in order to guarantee that value iteration occurs the max number of iterations for comparison with the other algorithms. # run planning from our initial state
for nIter in iterations:
    startTime = clock()
    vi.runVI()
    timing['Value'].append(timing['Value'][-1] + clock() - startTime)
    p = vi.planFromState(initialState)
    convergence['Value'].append(vi.latestDelta)
    # evaluate the policy with evalTrials rollouts
    runEvals(initialState, p, rewards['Value'], steps['Value'])
    if nIter == 10:
        dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                      'out/hard/Value {} Iter {} Policy Map.pkl'.format(world, nIter))
    if vi.latestDelta < 1e-5:
        simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                               "Converged Value Iteration {}".format(nIter))
        dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                      'out/hard/Converged Value {} Iter {} Policy Map.pkl'.format(world, nIter))
        break
    elif nIter == MAX_ITERATIONS:
        simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                               "Final Value Iteration {}".format(nIter))
        dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                      'out/hard/Final Value {} Iter {} Policy Map.pkl'.format(world, nIter))
print("\n\n\n")
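
# Illustrative driver for the entry-point functions defined earlier. The map
# below and its wall convention (0 = open, 1 = wall, matching BURLAP's
# GridWorldDomain.setMap) are assumptions for the example, not repo data.
if __name__ == '__main__':
    easyMap = [[0, 0, 0, 0],
               [0, 1, 1, 0],
               [0, 0, 0, 0],
               [1, 1, 0, 0]]
    maxX = len(easyMap) - 1
    maxY = len(easyMap[0]) - 1
    vIteration('Easy', easyMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100)
    pIteration('Easy', easyMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100)
    qLearning('Easy', easyMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000)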