Example #1
def markovchainesti(mdp, start_state=0, epsilon=4, randomseed=None, algo="episodic", delta=0.1, bounds="MBAE"):

	if(randomseed is not None):
		np.random.seed(randomseed)
	policies = np.array(getPolicies(mdp.numStates, mdp.numActions))
	numPolicies = len(policies)
	print("Total policies: ", numPolicies)
	H = int(math.log(epsilon/(2*mdp.Vmax*(1 - mdp.discountFactor)))/math.log(mdp.discountFactor))
	print("Chosen value of H is : ", H)

	
	## Initializations
	it = 0
	samples = 0
	initial_iterations = 1 * mdp.numStates * mdp.numActions
	R_s_a = np.zeros((mdp.numStates, mdp.numActions))
	R_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
	N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates), dtype=int)
	N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=int)
	P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
	Qupper = mdp.Vmax*np.ones((numPolicies, mdp.numStates))
	QupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates))
	Qlower = np.zeros((numPolicies, mdp.numStates))
	Qstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates))
	QstarMBAE = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates))
	QlowerMBAE = np.zeros((numPolicies, mdp.numStates))
	
	V_true = np.zeros(mdp.numStates)
	V_estimate = np.zeros(mdp.numStates)
	V_error = np.zeros(MAX_ITERATION_LIMIT + 1)

	P_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates))
	P_lower_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates))
	VlowerMBAE = np.zeros((numPolicies, mdp.numStates))
	VupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates))
	Vstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates))
	discovered_states = set([start_state])
	deltadeltaV = np.zeros((mdp.numStates))
	state_dist = np.zeros((mdp.numStates))
	state_dist[start_state] = 1
	print(state_dist)


	while it < initial_iterations:
		for state in range(mdp.numStates):
			for act in range(mdp.numActions):
				it = it + 1
				ss, rr = mdp.simulate(state, act)
				print("Sampling", state, act, rr, ss)
				R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act])/(N_s_a[state][act] + 1)
				R_s_a_sprime[state][act][ss] = rr
				N_s_a[state][act] = N_s_a[state][act] + 1
				N_s_a_sprime[state][act][ss] = N_s_a_sprime[state][act][ss] + 1
				for s2 in range(mdp.numStates):
					P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]
	samples += initial_iterations

	if(algo=="use_ddv"):
		ff = open(f"logs/{mdp.filename}-markovddv{randomseed}.txt", 'w')
	elif(algo=="episodic"):
		ff = open(f"logs/{mdp.filename}-markoveps{randomseed}.txt", 'w')
	elif(algo=="uniform"):
		ff = open(f"logs/{mdp.filename}-markovuni{randomseed}.txt", 'w')
	elif(algo=="greedyMBAE"):
		ff = open(f"logs/{mdp.filename}-markovMBAE{randomseed}.txt", 'w')
	elif(algo=="greedyMBIE"):
		ff = open(f"logs/{mdp.filename}-markovMBIE{randomseed}.txt", 'w')
	elif(algo=="mybest"):
		ff = open(f"logs/{mdp.filename}-markovbest{randomseed}.txt", 'w')
	elif(algo=="runcertainty"):
		ff = open(f"logs/{mdp.filename}-markovruncertainty{randomseed}.txt", 'w')
	elif(algo=="unc_contri"):
		ff = open(f"logs/{mdp.filename}-markovunc_contri{randomseed}.txt", 'w')

	
	while samples<MAX_ITERATION_LIMIT/2:
		
		p = 0
		current_policy = fixedPolicy

		for i in range(mdp.numStates):
			# print "For state ", i, " doing UpperP"
			if(N_s_a[i][current_policy[i]]>0):
				P_tilda[p][i] = UpperP(
					i,
					current_policy[i],
					delta,
					N_s_a_sprime[i][current_policy[i]],
					mdp.numStates,
					Qupper[p],
					False
					)
				P_lower_tilda[p][i] = LowerP(
					i,
					current_policy[i],
					delta,
					N_s_a_sprime[i][current_policy[i]],
					mdp.numStates,
					Qlower[p],
					False
					)

		Qupper[p] = itConvergencePolicy(
			Qupper[p],
			getRewards(R_s_a, current_policy),
			P_tilda[p],
			mdp.discountFactor,
			epsilon,
			converge_iterations,
			epsilon_convergence
			)

		Qlower[p] = itConvergencePolicy(
			Qlower[p],
			getRewards(R_s_a, current_policy),
			P_lower_tilda[p],
			mdp.discountFactor,
			epsilon,
			converge_iterations,
			epsilon_convergence
			)	

		Qstar[p] = itConvergencePolicy(
			Qstar[p],
			getRewards(R_s_a, current_policy),
			getProb(P_s_a_sprime, current_policy),
			mdp.discountFactor,
			epsilon,
			converge_iterations,
			epsilon_convergence
			)	

		for internal in range(converge_iterations):
			
			oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state])
			for state in range(mdp.numStates):
				act = current_policy[state]
				firstterm = R_s_a[state][act]
				secondterm = mdp.discountFactor*np.sum(VupperMBAE[p]*(P_s_a_sprime[state][act]))
				lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p]*(P_s_a_sprime[state][act]))
				star_secondterm = mdp.discountFactor*np.sum(Vstar[p]*(P_s_a_sprime[state][act]))
				thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[state][act])
				QupperMBAE[p][state] = firstterm + secondterm + thirdterm
				QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm
				QstarMBAE[p][state] = firstterm + star_secondterm
				VupperMBAE[p][state] = QupperMBAE[p][state]
				VlowerMBAE[p][state] = QlowerMBAE[p][state]
				Vstar[p][state] = QstarMBAE[p][state]
			if(np.linalg.norm(oldQlowerMBAE-QlowerMBAE[p][start_state])<=epsilon_convergence):
				break
		policy1Index = 0
		
		h = 0
		policy1 = fixedPolicy
		state = start_state
		# print "samples", samples
		if (samples%10000)<100:
			if(verbose==0):
				ff.write(str(samples))
				ff.write('\t')
				if(plot_vstar):
					ff.write(str(Vstar[policy1Index][start_state]))
				else:
					ff.write(str(QupperMBAE[policy1Index][start_state]-QlowerMBAE[policy1Index][start_state]))#-epsilon*(1-mdp.discountFactor)/2 
				print(samples, QupperMBAE[policy1Index][start_state]-QlowerMBAE[policy1Index][start_state])
				ff.write('\n')
			else:
				print(samples)
				print(QupperMBAE[:,start_state], QlowerMBAE[:,start_state])

		polList = [policy1Index]

		# print(R_s_a.shape)
		# print(mdp.rewards)
		# print(mdp.transitionProbabilities)
		# print(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities))

		# print(getRewards(R_s_a, current_policy).shape)

		# print(getRewards(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities), current_policy).shape)

		# print(mdp.transitionProbabilities.shape)
		# print(P_s_a_sprime.shape)

		# print(getProb(mdp.transitionProbabilities, current_policy))
		# print(getProb(P_s_a_sprime, current_policy))


		# V_true = itConvergencePolicy(V_true,
		# 	getRewards(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities), current_policy),
		# 	getProb(mdp.transitionProbabilities, current_policy),
		# 	mdp.discountFactor,
		# 	epsilon,
		# 	converge_iterations,
		# 	epsilon_convergence
		# 	)

		# V_estimate	= itConvergencePolicy(V_estimate,
		# 	getRewards(R_s_a, current_policy),
		# 	getProb(P_s_a_sprime, current_policy),
		# 	mdp.discountFactor,
		# 	epsilon,
		# 	converge_iterations,
		# 	epsilon_convergence
		# 	)

		#print(V_estimate)
		V_true = itConvergencePolicy(V_true,
			getRewards(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities), current_policy),
			getProb(mdp.transitionProbabilities, current_policy),
			mdp.discountFactor,
			epsilon,
			converge_iterations,
			epsilon_convergence
			)
	


		if(algo=="use_ddv"):
			## Calculate V for all states
			for pnum in polList:
				policiesfddv = fixedPolicy
				# print "Getting DDV values"
				for st in list(discovered_states):
					ac = policiesfddv[st]
					#### Compute del del V
					deltadeltaV[st] = CalculateDelDelV(
						st,
						ac,
						mdp,
						N_s_a_sprime,
						QupperMBAE[pnum],
						QlowerMBAE[pnum],
						None,
						None,
						start_state,
						P_s_a_sprime,
						P_tilda[pnum],
						P_lower_tilda[pnum],
						R_s_a,
						epsilon,
						delta,
						converge_iterations,
						epsilon_convergence,
						policiesfddv
						)

				# print deltadeltaV
				cs = np.argmax(deltadeltaV)
				ca = policiesfddv[cs]
				# print deltadeltaV, cs, ca
				# print deltadeltaV, policy1, policy2
				# print "Found max state for DDV: ",cs,ca
				# time.sleep(0.1)
				ss, rr = mdp.simulate(cs, ca)
				# print "Policy is ", policiesfddv
				# print "Sampling ", cs, ca

				time.sleep(0.1)	
				samples = samples +  1
				discovered_states.add(ss)
				R_s_a[cs][ca] = (rr + R_s_a[cs][ca]*N_s_a[cs][ca])/(N_s_a[cs][ca]+1)
				N_s_a[cs][ca] += 1
				N_s_a_sprime[cs][ca][ss] += 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[cs][ca][s2] = (float)(N_s_a_sprime[cs][ca][s2])/N_s_a[cs][ca]
		
		elif(algo == "episodic"):
			while h<H:
				act = policy1[state]
				# print "------>",current_state, current_action
				ss, rr = mdp.simulate(state, act)
				# print "Sampling ", state, act
				samples+=1
				R_s_a[state][act] = (rr + R_s_a[state][act]*N_s_a[state][act])/(N_s_a[state][act]+1)
				N_s_a[state][act] += 1
				N_s_a_sprime[state][act][ss] += 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]
				state = ss
				h+=1

				V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence)

		elif(algo == "uniform"):
			for st in range(mdp.numStates):
				ac = fixedPolicy[st]
				ss, rr = mdp.simulate(st, ac)
				# print "Sampling ", st, ac
				samples += 1
				R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
				N_s_a[st][ac] += 1
				N_s_a_sprime[st][ac][ss] += 1
				for s2 in range(mdp.numStates):
					P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]

				V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence)


		elif(algo == "runcertainty"):

			deltaW = np.zeros(mdp.numStates)
			mu = np.zeros(mdp.numStates)
			D = np.zeros(mdp.numStates)

			mu[start_state] = 1

			for t in range(H):
				D = D + (mdp.discountFactor**t) * mu
				mu = prob_step(mu, P_s_a_sprime, fixedPolicy)

			for st in range(mdp.numStates):
				#transition uncertainty for given s, pi(s)
				deltaW[st] = delW(st, fixedPolicy[st], delta, N_s_a_sprime[st][fixedPolicy[st]], mdp.numStates, False)

			st = np.argmax(deltaW * D)
			# if samples % 100 == 0:
			# 	print deltaW, D, deltaW * D, np.argmax(deltaW * D)
			ac = fixedPolicy[st]
			ss, rr = mdp.simulate(st, ac)
			# print "Sampling ", st, ac, rr, ss
			samples += 1
			R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
			N_s_a[st][ac] += 1
			N_s_a_sprime[st][ac][ss] += 1
			for s2 in range(mdp.numStates):
				P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]

			V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence)

		elif(algo == "unc_contri"):
			mu = np.zeros(mdp.numStates)
			D = np.zeros(mdp.numStates)

			z_quantile = 2.0

			mu[start_state] = 1


			transitionEstimate = getProb(P_s_a_sprime, fixedPolicy)
			rewardEstimate = getRewards(R_s_a, fixedPolicy)
			
			for t in range(H):
				D = D + (mdp.discountFactor**t) * mu

				mu = np.dot(mu, transitionEstimate)

			V_esti = np.dot(rewardEstimate, D)

			V_uncertainty = np.zeros(mdp.numStates)

			for s in range(mdp.numStates):

				transitionEstimateUpper = np.copy(transitionEstimate)
				transitionEstimateLower = np.copy(transitionEstimate)

				for sprime in range(mdp.numStates):
					#Wilson score interval confidence bounds for each transition

					muTerm = (transitionEstimate[s][sprime] + (z_quantile**2)/(2*N_s_a[s][fixedPolicy[s]])) / (1 + (z_quantile**2)/N_s_a[s][fixedPolicy[s]])
					devTerm = (z_quantile / (1 + (z_quantile**2)/N_s_a[s][fixedPolicy[s]])) * math.sqrt( (transitionEstimate[s][sprime]*(1-transitionEstimate[s][sprime])/N_s_a[s][fixedPolicy[s]]) + (z_quantile**2)/(4*N_s_a[s][fixedPolicy[s]]**2))
					
					# print(transitionEstimate[s][sprime], muTerm, devTerm)
					# if s == 1:
					# 	print "mu", muTerm
					# 	print "dev", devTerm
					# 	print (z_quantile / (1 + (z_quantile**2)/N_s_a[s][fixedPolicy[s]]))
					# 	print z_quantile, N_s_a[s][fixedPolicy[s]]
					# 	print (transitionEstimate[s][sprime]*(1-transitionEstimate[s][sprime])/N_s_a[s][fixedPolicy[s]])
					# 	print (z_quantile**2)/(4*N_s_a[s][fixedPolicy[s]]**2)
					transitionEstimateUpper[s][sprime] = muTerm + devTerm
					transitionEstimateLower[s][sprime] = muTerm - devTerm

					#print(transitionEstimateUpper[s][sprime], transitionEstimateLower[s][sprime] )
					#print("_____________")
				# if samples > 49500:
				# 	print(s, N_s_a[s][fixedPolicy[s]])
				# 	print(transitionEstimate)
				# 	print(transitionEstimateUpper)
				# 	print(transitionEstimateLower)

				upperD = np.zeros(mdp.numStates)
				lowerD = np.zeros(mdp.numStates)

				uppermu = np.zeros(mdp.numStates)
				uppermu[start_state] = 1
				lowermu = np.zeros(mdp.numStates)
				lowermu[start_state] = 1

				for t in range(H):
					upperD = upperD + (mdp.discountFactor**t) * uppermu

					uppermu = np.dot(uppermu, transitionEstimateUpper)

					lowerD = lowerD + (mdp.discountFactor**t) * lowermu

					lowermu = np.dot(lowermu, transitionEstimateLower)
				# if samples > 49500:
				# 	print("___________")
				# 	print(upperD)
				# 	print(lowerD)
				# 	print(rewardEstimate)
				# 	print(np.dot(rewardEstimate, upperD) - np.dot(rewardEstimate, lowerD))

				# print(V_uncertainty)
				V_uncertainty[s] = abs(np.dot(rewardEstimate, upperD) - np.dot(rewardEstimate, lowerD))

			# if samples > 49500:
			# 	print("V_unc", V_uncertainty)
			st = np.argmax(V_uncertainty)	

			ac = fixedPolicy[st]
			ss, rr = mdp.simulate(st, ac)
			
			# if samples > 49500:
			# 	print(f"Sample no. {samples}", st, ac, rr, ss)
			samples += 1
			R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
			N_s_a[st][ac] += 1
			N_s_a_sprime[st][ac][ss] += 1
			for s2 in range(mdp.numStates):
				P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]

			V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence)
			# if samples > 49500:
			# 	print(f"Error = {V_error[samples]}")
			# if V_error[samples]/V_true[start_state] > -0.5 :
			# 	print(samples)
			# if V_error[samples]/V_true[start_state] > 0.35 and V_error[samples]> V_error[samples-1]:
			# 	print samples

		elif(algo == "greedyMBAE"):

			st = max(range(mdp.numStates), key=lambda x: VupperMBAE[0][x]-VlowerMBAE[0][x])
			ac = fixedPolicy[st]
			ss, rr = mdp.simulate(st, ac)
			# print "Sampling ", st, ac
			samples += 1
			R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
			N_s_a[st][ac] += 1
			N_s_a_sprime[st][ac][ss] += 1
			for s2 in range(mdp.numStates):
				P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]

		elif(algo == "greedyMBIE"):

			st = max(range(mdp.numStates), key=lambda x: Qupper[0][x]-Qlower[0][x])
			ac = fixedPolicy[st]
			ss, rr = mdp.simulate(st, ac)
			# print "Sampling ", st, ac
			samples += 1
			R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
			N_s_a[st][ac] += 1
			N_s_a_sprime[st][ac][ss] += 1
			for s2 in range(mdp.numStates):
				P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]

		elif(algo == "mybest"):
			if(samples%10000<50):
				state_dist = np.zeros((mdp.numStates))
				state_dist[start_state] = 1
			N = getSampleCount(state_dist, N_s_a_sprime, QupperMBAE[policy1Index], QlowerMBAE[policy1Index], QstarMBAE[policy1Index])
			# print N
			for i in range(N):
				# print state_dist, samples, P_s_a_sprime
				# import pdb; pdb.set_trace()
				st = np.random.choice(np.arange(mdp.numStates), p=state_dist)
				ac = fixedPolicy[st]
				ss, rr = mdp.simulate(st, ac)
				# print "Sampling ", st, ac
				samples += 1
				R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
				N_s_a[st][ac] += 1
				N_s_a_sprime[st][ac][ss] += 1
				for s2 in range(mdp.numStates):
					P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]
			state_dist = prob_step(state_dist, P_s_a_sprime, fixedPolicy)

		if (samples%1000)<100:
			if(QupperMBAE[policy1Index][start_state]-QlowerMBAE[policy1Index][start_state]-epsilon*(1-mdp.discountFactor)/2<0):
				print(Qupper[policy1Index][start_state],Qstar[policy1Index][start_state],epsilon*(1-mdp.discountFactor)/2)
				print("Epsilon condition reached at ",samples, " samples")
				return fixedPolicy
			else:
				# print QupperMBAE[policy2Index][start_state],QstarMBAE[policy1Index][start_state],epsilon*(1-mdp.discountFactor)/2
				pass
			# print "ends here"
	print(mdp.numStates, mdp.numActions)
	# plt.plot(1 + np.arange(MAX_ITERATION_LIMIT//2)[mdp.numStates * mdp.numActions + 500:], V_error[mdp.numStates * mdp.numActions + 500: MAX_ITERATION_LIMIT//2]/ V_true[start_state])
	# plt.title('Uniform Sampling')
	# plt.xlabel('samples')
	# plt.ylabel('Error fraction in value function')
	# plt.show()

	print(algo, " ", V_true)
	print(algo, " ", V_estimate)
	ff.close()
	return V_error/V_true[start_state]
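
The code above relies on a few small helpers that are not shown in this listing: `prob_step` (used in the "runcertainty" and "mybest" branches) and `getRewards`/`getProb` (used to slice out the fixed policy's rewards and transitions). The following is a minimal sketch of what they could look like, inferred purely from how they are called; the exact originals may differ.

import numpy as np

def prob_step(state_dist, P_s_a_sprime, policy):
	# Push the state distribution one step forward under the fixed policy:
	# mu'[s'] = sum_s mu[s] * P[s, policy[s], s'].
	next_dist = np.zeros(len(state_dist))
	for s in range(len(state_dist)):
		next_dist += state_dist[s] * P_s_a_sprime[s][policy[s]]
	return next_dist

def getRewards(R_s_a, policy):
	# Reward vector R[s, policy[s]] induced by the fixed policy.
	return np.array([R_s_a[s][policy[s]] for s in range(len(policy))])

def getProb(P_s_a_sprime, policy):
	# Transition matrix P[s, policy[s], :] induced by the fixed policy.
	return np.array([P_s_a_sprime[s][policy[s]] for s in range(len(policy))])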
Example #2
def FeichterPolicy(mdp, start_state=0, epsilon=1, randomseed=None, delta=0.1):
    global c
    if (randomseed is not None):
        np.random.seed(randomseed)
    # orig_stdout = sys.stdout
    # f = open('Fiechter-m01.txt', 'w')
    # sys.stdout = f

    ##### Initialisation
    print(mdp.Vmax, 6 / epsilon, mdp.discountFactor)
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) /
            (1 - mdp.discountFactor))

    print("Chosen value of H is : ", H)
    N_h_s_a = np.zeros((H, mdp.numStates, mdp.numActions))
    N_h_s_a_s_prime = np.zeros(
        (H, mdp.numStates, mdp.numActions, mdp.numStates), dtype=int)
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    P_h_s_a_s_prime = np.zeros(
        (H, mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    policy_h_s = np.zeros((H, mdp.numStates), dtype=int)
    d_h_policy_s = np.zeros((H + 1, mdp.numStates))
    dmax = 12 * mdp.Vmax / (epsilon * (1 - mdp.discountFactor))
    converge_iterations = 10000
    epsilon_convergence = 1e-4

    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    QstarMBAE = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    it = 0
    samples = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    #### For starting the while loop below
    iteration = 1

    if (verbose == 0):
        outp = open(mdp.filename + '-fiechter' + str(randomseed) + '.txt',
                    'w')
    # sys.stdout = open(mdp.filename+'-fiechter.txt', 'w+')
    ff = open(mdp.filename + '-fiechter-samples.txt', 'w+')

    #### Exploration
    # while d_h_policy_s[0][start_state]>2/(1-mdp.discountFactor) or iteration==1:
    acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
    coll = Qupper[start_state][acList[1]] - Qlower[start_state][
        acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
    while coll > 0 or iteration < 50:
        # print d_h_policy_s[0][start_state], " > ", 2/(1-mdp.discountFactor)
        # print policy_h_s[0]
        h = 0
        current_state = start_state
        while h < H:
            current_action = policy_h_s[h][current_state]
            # print "------>",current_state, current_action
            s_prime, r = mdp.simulate(current_state, current_action)
            N_h_s_a[h][current_state][current_action] += 1
            rewards_s_a_sprime[current_state][current_action][s_prime] += r
            R_s_a[current_state][current_action] = (
                r + R_s_a[current_state][current_action] *
                sampled_frequency_s_a[current_state][current_action]) / (
                    sampled_frequency_s_a[current_state][current_action] + 1)
            N_h_s_a_s_prime[h][current_state][current_action][s_prime] += 1
            N_s_a_sprime[current_state][current_action][s_prime] += 1
            sampled_frequency_s_a[current_state][current_action] += 1
            for s2 in range(mdp.numStates):
                P_h_s_a_s_prime[h][current_state][current_action][
                    s2] = N_h_s_a_s_prime[h][current_state][current_action][
                        s2] / N_h_s_a[h][current_state][current_action]
            h += 1
            current_state = s_prime
            samples += 1
            if (samples % 100 == 0):
                acList = bestTwoActions(mdp, start_state, QlowerMBAE,
                                        QupperMBAE, QstarMBAE)
                if (verbose == 0):
                    outp.write(str(samples))
                    outp.write('\t')
                    outp.write(
                        str(QupperMBAE[start_state][acList[1]] -
                            QlowerMBAE[start_state][acList[0]])
                    )  #-epsilon*(1-mdp.discountFactor)/2
                    outp.write('\n')
                else:
                    print(Qupper[start_state], Qlower[start_state])
                    # print d_h_policy_s[0][start_state]-2/(1-mdp.discountFactor)
                    # print samples, (QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]])-epsilon*(1-mdp.discountFactor)/2
                np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
                ff.write('\n')
                # print samples, d_h_policy_s[0][start_state]-2/(1-mdp.discountFactor)

        # Compute new policy dynamic program
        e_s_a = np.zeros((mdp.numStates, mdp.numActions))
        for h in range(H - 1, -1, -1):
            for state in range(mdp.numStates):
                current_max = -float("inf")
                argmax_action = -1
                for act in range(mdp.numActions):
                    if (N_h_s_a[h][state][act] == 0):
                        e_s_a[state][act] = dmax
                    else:
                        sqterm = (2 * math.log(
                            4 * H * mdp.numStates * mdp.numActions) -
                                  2 * math.log(delta)) / N_h_s_a[h][state][act]
                        summation = np.sum(
                            (N_h_s_a_s_prime[h][state][act] /
                             N_h_s_a[h][state][act]) * d_h_policy_s[h + 1])
                        secondterm = mdp.discountFactor * summation
                        e_s_a[state][act] = min(
                            dmax, 6 * mdp.Vmax * (math.sqrt(sqterm)) /
                            (epsilon * (1 - delta)) + secondterm)

                policy_h_s[h][state] = np.argmax(e_s_a[state])
                d_h_policy_s[h][state] = np.amax(e_s_a[state])

        # Compute MBAE QupperMBAE and QlowerMBAE bounds
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]
                                       ) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] /
                                 sampled_frequency_s_a[state][act]))
                    #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (samples**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        sampled_frequency_s_a[state][act])
                    #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    QstarMBAE[state][act] = firstterm + star_secondterm
                    # Calculation for Vstar
                    # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                    # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(QstarMBAE[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                break

        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vlower, False)

        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        # Qstar, _ = iteratedConvergence(Qstar,R_s_a,P_,mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)

        iteration += 1
        acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                QstarMBAE)
        coll = QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][
            acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
    # sys.stdout = orig_stdout
    # f.close()
    print(iteration)
    a = open('final' + mdp.filename + '-fiechter.txt', 'a+')
    a.write(str(iteration) + '\n')
    a.close()
    return getBestPolicy(mdp, rewards_s_a_sprime, P_h_s_a_s_prime[0])
    # return policy_h_s[0]
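
`bestTwoActions(mdp, state, Qlower, Qupper, Qstar)` is used here and in the later examples but never defined. Given how `acList[0]` and `acList[1]` enter the LUCB-style stopping condition (upper bound of the best challenger minus lower bound of the empirical best), a plausible sketch is the following; it is an assumption, not the original implementation.

import numpy as np

def bestTwoActions(mdp, state, Qlower, Qupper, Qstar):
    # acList[0]: empirically best action at `state` according to Qstar.
    # acList[1]: strongest challenger among the remaining actions, by upper bound.
    # Qlower is accepted only for signature compatibility with the callers.
    best = int(np.argmax(Qstar[state]))
    challengers = [a for a in range(mdp.numActions) if a != best]
    contender = max(challengers, key=lambda a: Qupper[state][a])
    return [best, contender]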
Example #3
def mbie(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):

	global c
	if(randomseed is not None):
		np.random.seed(randomseed)
	initial_iterations = 1*mdp.numStates*mdp.numActions
	### Estimate the horizon based on Fiechter
	H = int((math.log(mdp.Vmax) + math.log(6.0/epsilon))/(1-mdp.discountFactor))
	it=0
	samples = 0

	### Calculating m based on the parameters

	first_term = mdp.numStates/(epsilon**2*(1-mdp.discountFactor)**4)
	second_term = math.log(mdp.numStates*mdp.numActions/(epsilon*(1-mdp.discountFactor)*delta))/(epsilon**2*(1-mdp.discountFactor)**4)
	m = c*(first_term+second_term)
	print "Chosen value of m is :",H, m
	N_s_a = np.zeros((mdp.numStates,mdp.numActions))
	N_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	P_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	P_tilda = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	P_lower_tilda = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	R_s_a = np.zeros((mdp.numStates,mdp.numActions))
	Qupper = mdp.Vmax*np.random.random([mdp.numStates,mdp.numActions])
	QupperMBAE = mdp.Vmax*np.ones((mdp.numStates,mdp.numActions))
	Qlower = np.zeros((mdp.numStates,mdp.numActions))
	QlowerMBAE = np.zeros((mdp.numStates,mdp.numActions))
	Qstar = (mdp.Vmax/2)*np.ones((mdp.numStates,mdp.numActions))
	Vupper = mdp.Vmax*np.random.random([mdp.numStates])
	VupperMBAE = mdp.Vmax*np.ones((mdp.numStates))
	Vlower = np.zeros((mdp.numStates))
	VlowerMBAE = np.zeros((mdp.numStates))
	Vstar = (mdp.Vmax/2)*np.ones((mdp.numStates))
	best_policy = (-1)*np.ones((mdp.numStates), dtype=int)

	### Initial sampling for all state action pairs
	while it < initial_iterations:
		for state in range(mdp.numStates):
			for act in range(mdp.numActions):
				it+=1
				ss, rr = mdp.simulate(state, act)
				R_s_a[state][act] = rr
				N_s_a[state][act] += 1
				N_s_a_sprime[state][act][ss] += 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]

	samples += initial_iterations
	print(P_s_a_sprime)
	print("Completed initial iterations")
	Qupper, Vupper = iteratedConvergence(Qupper,R_s_a,P_s_a_sprime,mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
	print(Qupper, "Qupper")
	# print Qupper, Vupper
	current_state = start_state
	### Repeat forever

	if(verbose==0):
		outp = open(mdp.filename+'-mbie' + str(randomseed) +'.txt', 'w')
	# sys.stdout = open(mdp.filename+'-mbie.txt', 'w+')
	ff = open(mdp.filename+'-mbie-samples.txt', 'w+')

	while samples<MAX_ITERATION_LIMIT:
		current_state = start_state
		h=1
		# print Qupper[start_state], Qstar[start_state], Qlower[start_state]
		while h<=H:
			if(samples%100==0):
				acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
				if(verbose==0):
					outp.write(str(samples))
					outp.write('\t')
					outp.write(str(QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]]))#-epsilon*(1-mdp.discountFactor)/2 
					outp.write('\n')
					print(samples, QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]])
				else:
					print(samples, (QupperMBAE[start_state][acList[1]], QlowerMBAE[start_state][acList[0]]))
					pass
				np.savetxt(ff, N_s_a, delimiter=',')
				ff.write('\n')
			for i in range(mdp.numStates):
				# print "For state ", i, " doing UpperP"
				for j in range(mdp.numActions):
					P_tilda[i][j] = UpperP(i,j,delta,N_s_a_sprime[i][j],mdp.numStates,Vupper,False)
					P_lower_tilda[i][j] = LowerP(i,j,delta,N_s_a_sprime[i][j],mdp.numStates,Vlower,False)
			# print "Starting iterating"
			# print Qupper
			# return 2
			Qupper, Vupper = iteratedConvergence(Qupper,R_s_a,P_tilda,mdp.discountFactor,epsilon, converge_iterations, epsilon_convergence)
			Qlower, Vlower = iteratedConvergence(Qlower,R_s_a,P_lower_tilda,mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
			current_action = np.argmax(QupperMBAE[current_state])
			# print Qupper[start_state], Qlower[start_state]
			best_policy[current_state] = current_action
			if(N_s_a[current_state][current_action]<m):
				for t in range(1):
					ss,rr = mdp.simulate(current_state, current_action)
					R_s_a[current_state][current_action] = (rr + R_s_a[current_state][current_action]*N_s_a[current_state][current_action])/(N_s_a[current_state][current_action]+1)
					N_s_a[current_state][current_action] += 1
					N_s_a_sprime[current_state][current_action][ss] += 1
					samples += 1
				for s2 in range(mdp.numStates):
					# print current_state, current_action, s2, N_s_a_sprime[current_state][current_action][s2], N_s_a[current_state][current_action]
					P_s_a_sprime[current_state][current_action][s2] = float(N_s_a_sprime[current_state][current_action][s2])/N_s_a[current_state][current_action]
				current_state = ss

			else:
				print "TRUEEEE"
				print N_s_a[current_state]
				# print P_s_a_sprime[current_state][current_action]
				# print np.sum(P_s_a_sprime[current_state][current_action])
				# print N_s_a[current_state][current_action]
				current_state = np.random.choice(np.arange(mdp.numStates), p=P_s_a_sprime[current_state][current_action]/np.sum(P_s_a_sprime[current_state][current_action]))

		
			h += 1
		
		# Compute MBAE Qupper and Qlower bounds
		for internal in range(converge_iterations):
			oldQlower = np.copy(QlowerMBAE[start_state])
			for state in range(mdp.numStates):
				for act in range(mdp.numActions):
					# Calculations for Qupper and Qlower mBAE
					firstterm = R_s_a[state][act]
					secondterm = mdp.discountFactor*np.sum(VupperMBAE*(N_s_a_sprime[state][act]/N_s_a[state][act]))
					#secondterm = mdp.discountFactor*sum(Vupper[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))  
					lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE*(N_s_a_sprime[state][act]/N_s_a[state][act]))
					star_secondterm = mdp.discountFactor*np.sum(Vstar*(N_s_a_sprime[state][act]/N_s_a[state][act]))
					#lower_secondterm = mdp.discountFactor*sum(Vlower[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))  
					thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*mdp.numActions)-math.log(delta))/N_s_a[state][act])
					#Qupper[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/N_s_a[state][act]) + secondterm + thirdterm
					QupperMBAE[state][act] = firstterm + secondterm + thirdterm
					# print firstterm, secondterm, thirdterm
					QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
					Qstar[state][act] = firstterm + star_secondterm
					# Calculation for Vstar
					# t = (float)N_s_a_sprime[state][act][stateprime]/N_s_a[state][act]
					# val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
				VupperMBAE[state] = np.amax(QupperMBAE[state])
				VlowerMBAE[state] = np.amax(QlowerMBAE[state])
				Vstar[state] = np.amax(Qstar[state])
			if(np.linalg.norm(oldQlower-QlowerMBAE[start_state])<=epsilon_convergence):
				# print "Stopping with ", internal, "iterations"
				break		


	return best_policy
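
All of the examples call `iteratedConvergence(Q, R, P, gamma, epsilon, converge_iterations, epsilon_convergence)` and expect back both a Q table and the corresponding V vector. A minimal sketch consistent with that usage is plain Q-iteration under the supplied model, run until the table stops changing; the real routine may differ in its stopping rule.

import numpy as np

def iteratedConvergence(Q, R_s_a, P, discountFactor, epsilon,
                        converge_iterations, epsilon_convergence):
    # Q-iteration under the estimated model (R_s_a: (S, A), P: (S, A, S')).
    # `epsilon` is accepted only for signature compatibility with the callers.
    for _ in range(converge_iterations):
        V = np.max(Q, axis=1)
        Q_new = R_s_a + discountFactor * P.dot(V)  # (S, A) Bellman backup
        done = np.max(np.abs(Q_new - Q)) <= epsilon_convergence
        Q = Q_new
        if done:
            break
    return Q, np.max(Q, axis=1)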
Example #4
def LUCBEpisodic(mdp,
                 start_state=0,
                 epsilon=4,
                 randomseed=None,
                 delta=0.1,
                 fileprint=1):
    if (randomseed is not None):
        np.random.seed(randomseed)
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) /
            (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones((mdp.numStates), dtype=int)
    states_to_sample = list(range(mdp.numStates))
    colliding_values = np.zeros((mdp.numStates))
    is_converged = 0
    print "Vmax", mdp.Vmax
    print "Epsilon is ", epsilon

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = (float)(
                        N_s_a_sprime[state][act]
                        [s2]) / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates thus far MBAE
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                # Calculations for QupperMBAE and QlowerMBAE
                firstterm = np.sum(rewards_s_a_sprime[state]
                                   [act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(
                    VupperMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                lower_secondterm = mdp.discountFactor * np.sum(
                    VlowerMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                star_secondterm = mdp.discountFactor * np.sum(
                    Vstar * (N_s_a_sprime[state][act] /
                             sampled_frequency_s_a[state][act]))
                thirdterm = mdp.Vmax * math.sqrt(
                    (math.log(c * mdp.numStates * mdp.numActions) -
                     math.log(delta)) / sampled_frequency_s_a[state][act])
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][
                    act] = firstterm + lower_secondterm - thirdterm
                Qstar[state][act] = firstterm + star_secondterm
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])
            Vstar[state] = np.amax(Qstar[state])

        if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                epsilon_convergence):
            print "Stopping with ", internal, "initial internal iterations"
            break

    if internal == converge_iterations - 1:
        print("Used all iterations")

    print "Initial estimate of QupperMBAE found! Now sampling"

    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)

    if (verbose == 0):
        outp = open(mdp.filename + '-lucbeps' + str(randomseed) + '.txt', 'w')
    ff = open(mdp.filename + '-lucbeps-samples.txt', 'w+')

    h = 0
    state1 = start_state

    iteration += initial_iterations

    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [
            sorted(states_to_sample,
                   key=lambda x: colliding_values[x],
                   reverse=True)[0]
        ]
        if (h % H == 0):
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
        actionsList = bestTwoActions(mdp, state1, QlowerMBAE, QupperMBAE,
                                     Qstar)
        a = np.random.choice(actionsList)
        iteration += 1

        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            R_s_a[state1][a] = (r + R_s_a[state1][a] *
                                sampled_frequency_s_a[state1][a]) / (
                                    sampled_frequency_s_a[state1][a] + 1)
            sampled_frequency_s_a[state1][a] += 1
            N_s_a_sprime[state1][a][s_prime] += 1
            if (verbose == 1):
                pass
                # print "s, a, sprime"
                # print state1, a, s_prime
            for s2 in range(mdp.numStates):
                P[state1][a][s2] = float(
                    N_s_a_sprime[state1][a][s2]) / sampled_frequency_s_a[state1][a]

        ## Calculating Q and V values
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vlower, False)

        if (verbose == 1):
            pass

        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)

        if (verbose == 1):
            # print "Calculated Q values are :"
            print(QupperMBAE[start_state], Qstar[start_state],
                  QlowerMBAE[start_state])

        # Calculations for QupperMBAE and QlowerMBAE
        #### This involved a two for-loop and iterating convergence
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]
                                       ) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] /
                                 sampled_frequency_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (iteration**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        sampled_frequency_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                break

        count = 0
        if (iteration % 100 == 0):
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            # print "Qupper, Qstar, Qlower"
            # print Qupper[start_state], Qstar[start_state], Qlower[start_state]
            if (verbose == 0):
                outp.write(str(iteration))
                outp.write('\t')
                outp.write(
                    str(QupperMBAE[start_state][acList[1]] -
                        QlowerMBAE[start_state][acList[0]])
                )  #-epsilon*(1-mdp.discountFactor)/2
                outp.write('\n')
            else:
                print(iteration, QupperMBAE[start_state][acList[1]] -
                      QlowerMBAE[start_state][acList[0]])
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')

        ##### Updating the list of coliliding states
        if (iteration > 50):
            states_to_sample = []
            for st in range(mdp.numStates):
                acList = bestTwoActions(mdp, st, QlowerMBAE, QupperMBAE, Qstar)
                ##### Changing stopping condition to epsilon*(1-gamma)/2
                colliding_values[st] = QupperMBAE[st][acList[1]] - QlowerMBAE[
                    st][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
                if (colliding_values[st] > 0):
                    ### this state is still colliding, add to sample states
                    states_to_sample.append(st)
        else:
            # for st in range(mdp.numStates):
            # 	acList = bestTwoActions(mdp, st, Qlower, Qupper, Qstar)
            # 	colliding_values[st] = Qupper[st][acList[1]]-Qlower[st][acList[0]]-epsilon*(1-mdp.discountFactor)/2
            colliding_values = np.arange(mdp.numStates)
            states_to_sample = list(range(mdp.numStates))

        #### Check epsilon condition for only starting state
        if (not (start_state in states_to_sample) and iteration > 50):
            # if(count==mdp.numStates):
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                    Qstar)
            print "Difference is ", Qupper[st][acList[1]] - Qlower[st][
                acList[0]]
            print "Setting final_policy of ", start_state, " to", acList[0]
            final_policy[start_state] = acList[0]
            print "Iterations taken : ", iteration
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE,
                                                     QupperMBAE, Qstar)[0]
            print "Returning policy : ", final_policy

            if (iteration != 51):
                a = open('final' + mdp.filename + '-lucbeps.txt', 'a+')
                a.write(str(iteration) + '\n')
                a.close()
            return final_policy

        h += 1

    outp.close()
    ff.close()

    for i in range(mdp.numStates):
        if (final_policy[i] == -1):
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE,
                                             Qstar)[0]
    return final_policy
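
`UpperP` and `LowerP` return optimistic and pessimistic transition vectors for a state-action pair from its visit counts. A sketch in the spirit of MBIE's L1 confidence set is shown below: shift probability mass, within a confidence radius, toward high-value (respectively low-value) successors. The radius formula and tie-breaking are assumptions; the original helpers may use different constants.

import math
import numpy as np

def UpperP(state, action, delta, counts, numStates, V, strict=False):
    # Optimistic transition estimate: start from the empirical distribution
    # and move up to half the L1 confidence radius of mass toward the
    # successor with the highest value V. `state`, `action`, and `strict`
    # are unused here and kept only to match the call sites above.
    total = float(np.sum(counts))
    if total <= 0:
        return np.ones(numStates) / numStates  # no data yet: fall back to uniform
    p_tilde = np.asarray(counts, dtype=float) / total
    # L1 radius, with log(2^S - 2) approximated by S*log(2) to avoid overflow.
    omega = min(2.0, math.sqrt(2.0 * (numStates * math.log(2.0) - math.log(delta)) / total))
    budget = omega / 2.0
    target = int(np.argmax(V))
    for s in sorted(range(numStates), key=lambda x: V[x]):  # donate from low-value states first
        if s == target or budget <= 0:
            continue
        moved = min(p_tilde[s], budget)
        p_tilde[s] -= moved
        p_tilde[target] += moved
        budget -= moved
    return p_tilde

def LowerP(state, action, delta, counts, numStates, V, strict=False):
    # Pessimistic counterpart: reuse UpperP with the value vector negated,
    # which moves the same mass toward the lowest-value successor instead.
    return UpperP(state, action, delta, counts, numStates, -np.asarray(V), strict)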
Example #5
def ddvouu(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):

    if (randomseed is not None):
        np.random.seed(randomseed)

    initial_iterations = 1 * mdp.numStates * mdp.numActions
    ### Estimate the horizon based on Fiechter

    c = 1
    it = 0
    samples = 0

    first_term = mdp.numStates / (epsilon**2 * (1 - mdp.discountFactor)**4)
    second_term = math.log(
        mdp.numStates * mdp.numActions /
        (epsilon *
         (1 - mdp.discountFactor) * delta)) / (epsilon**2 *
                                               (1 - mdp.discountFactor)**4)
    m = c * (first_term + second_term)
    delta = delta / (mdp.numStates * mdp.numActions * m)
    print("Chosen value of m is :", m)
    N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=int)
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates),
                            dtype=int)
    P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Vupper = mdp.Vmax * np.ones((mdp.numStates))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    best_policy = (-1) * np.ones((mdp.numStates), dtype=int)
    deltadeltaV = np.zeros((mdp.numStates, mdp.numActions))
    discovered_states = set([start_state, 1, 2, 3, 4])

    ## Initial sampling for all state action pairs
    ### Is this needed?
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1

                ss, rr = mdp.simulate(state, act)
                print("Sampling ", state, act, rr, ss)
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]
                                     ) / (N_s_a[state][act] + 1)
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                # P_s_a_sprime = np.copy(N_s_a_sprime)
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = (float)(
                        N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
    samples += initial_iterations

    print(P_s_a_sprime)
    print("Completed initial iterations")

    if (verbose == 0):
        outp = open(mdp.filename + '-ddv' + str(randomseed) + '.txt', 'w')
    # sys.stdout = open(mdp.filename+'-ddv.txt', 'w+')
    ff = open(mdp.filename + '-ddv-samples.txt', 'w+')

    # print Qupper, Vupper
    current_state = start_state
    ### Repeat forever
    while samples < MAX_ITERATION_LIMIT:
        # print Qupper[start_state], Qlower[start_state]
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (N_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vlower, False)

        ##Calculate Q values
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)

        current_state = start_state

        ### Terminating condition
        if (use_mbae):
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                    Qstar)
            coll = QupperMBAE[start_state][
                acList[1]] - QlowerMBAE[start_state][
                    acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
        else:
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            coll = Qupper[start_state][acList[1]] - Qlower[start_state][
                acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
        # if(Vupper[start_state]-Vlower[start_state]<=epsilon and samples>50):
        if (coll < 0 and samples > 50):
            a = open('final' + mdp.filename + '-ddv.txt', 'a+')
            a.write(str(samples) + '\n')
            a.close()
            print(Qupper[start_state], Vupper[start_state],
                  Vlower[start_state])
            policy_lower = np.argmax(Qlower, axis=1)
            print("Iteration number ", samples)
            print("Returning policy because of epsilon-convergence")
            print(policy_lower)
            print(np.argmax(QupperMBAE, axis=1))
            print(np.argmax(Qupper, axis=1))
            print(np.argmax(QlowerMBAE, axis=1))
            print(np.argmax(Qstar, axis=1))
            return policy_lower

        ## Calculate deldelV for all states
        if (use_mbae):
            for st in list(discovered_states):
                for ac in range(mdp.numActions):
                    #### Compute del del V
                    deltadeltaV[st][ac] = CalculateDelDelV(
                        st, ac, mdp, N_s_a_sprime, QupperMBAE, QlowerMBAE,
                        VupperMBAE, VlowerMBAE, start_state, P_s_a_sprime,
                        P_tilda, P_lower_tilda, R_s_a, epsilon, delta,
                        converge_iterations, epsilon_convergence)
        else:
            for st in list(discovered_states):
                for ac in range(mdp.numActions):
                    #### Compute del del V
                    deltadeltaV[st][ac] = CalculateDelDelV(
                        st, ac, mdp, N_s_a_sprime, Qupper, Qlower, Vupper,
                        Vlower, start_state, P_s_a_sprime, P_tilda,
                        P_lower_tilda, R_s_a, epsilon, delta,
                        converge_iterations, epsilon_convergence)

        #### Simulate greedily wrt deldelV
        # print np.unravel_index(deltadeltaV.argmax(), deltadeltaV.shape)
        current_state, current_action = np.unravel_index(
            deltadeltaV.argmax(), deltadeltaV.shape)
        #time.sleep(0.1)
        print(deltadeltaV)
        ss, rr = mdp.simulate(current_state, current_action)
        samples += 1
        print("Sampling ", current_state, current_action, rr, ss)

        #### Add received state to the set of discovered states
        #discovered_states.add(ss)
        print(discovered_states)
        ### Update believed model
        R_s_a[current_state][current_action] = (
            rr + R_s_a[current_state][current_action] *
            N_s_a[current_state][current_action]) / (
                N_s_a[current_state][current_action] + 1)
        N_s_a[current_state][current_action] += 1
        N_s_a_sprime[current_state][current_action][ss] += 1

        for s2 in range(mdp.numStates):
            # print current_state, current_action, s2, N_s_a_sprime[current_state][current_action][s2], N_s_a[current_state][current_action]
            P_s_a_sprime[current_state][current_action][s2] = (float)(
                N_s_a_sprime[current_state][current_action]
                [s2]) / N_s_a[current_state][current_action]

        if (samples % 100 == 0):
            if (use_mbae):
                acList = bestTwoActions(mdp, start_state, QlowerMBAE,
                                        QupperMBAE, Qstar)
            else:
                acList = bestTwoActions(mdp, start_state, Qlower, Qupper,
                                        Qstar)
            if (verbose == 0):
                outp.write(str(samples))
                outp.write('\t')
                if (plot_vstar):
                    outp.write(str(Vstar[start_state]))
                else:
                    if (use_mbae):
                        outp.write(
                            str(QupperMBAE[start_state][acList[1]] -
                                QlowerMBAE[start_state][acList[0]]))
                    else:
                        outp.write(
                            str(Qupper[start_state][acList[1]] -
                                Qlower[start_state][acList[0]]))
                outp.write('\n')
                if (use_mbae):
                    print(samples, (QupperMBAE[start_state][acList[1]] -
                                    QlowerMBAE[start_state][acList[0]]))
                else:
                    print(samples, (Qupper[start_state][acList[1]] -
                                    Qlower[start_state][acList[0]]))
            else:
                print(samples, (QupperMBAE[start_state][acList[1]] -
                                QlowerMBAE[start_state][acList[0]]))
            np.savetxt(ff, N_s_a, delimiter=',')
            ff.write('\n')

        ### Calculating MBAE bounds
        for internal in range(converge_iterations):
            oldQlower = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for Qupper and Qlower
                    firstterm = R_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE *
                        (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    #secondterm = mdp.discountFactor*sum(Vupper[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE *
                        (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    #lower_secondterm = mdp.discountFactor*sum(Vlower[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (samples**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        N_s_a[state][act])
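                    # thirdterm: Vmax-scaled confidence radius, shrinking like
                    # sqrt(log(c * t^2 * S * A / delta) / N(s, a))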
                    #Qupper[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/N_s_a[state][act]) + secondterm + thirdterm
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                    # Calculation for Vstar
                    # t = (float)N_s_a_sprime[state][act][stateprime]/N_s_a[state][act]
                    # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if (np.linalg.norm(oldQlower - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                # print "Stopping with ", internal, "iterations"
                break

        # if(samples==initial_iterations+2):
        # 	Qupper = np.copy(QupperMBAE)
        # 	Qlower = np.copy(QlowerMBAE)

    return best_policy
Example #6
0
def policyIt(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1, bounds="MBAE", use_ddv=False, mc = True):
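	# Rough outline of this routine: enumerate every deterministic policy, maintain
	# per-policy MBIE bounds (UpperP/LowerP + itConvergencePolicy) and MBAE-style
	# bounds, pick the empirically best policy and its strongest contender, gather
	# samples for them (via DDV, an occupancy-weighted Monte-Carlo rule, or full
	# episodes), and stop once the bound gap at start_state drops below
	# epsilon*(1-gamma)/2.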

	if(randomseed is not None):
		np.random.seed(randomseed)
	policies = np.array(getPolicies(mdp.numStates, mdp.numActions))
	numPolicies = len(policies)
	counts = np.zeros((numPolicies))
	print(numPolicies)
	#H = int((math.log(mdp.Vmax) + math.log(6.0/epsilon))/(1-mdp.discountFactor))
	H = int(math.log(epsilon/(2*mdp.Vmax*(1 - mdp.discountFactor)))/math.log(mdp.discountFactor))

	print("Chosen value of H is : ", H)
	
	## Initializations
	it = 0
	samples = 0
	initial_iterations = 1*mdp.numStates*mdp.numActions
	R_s_a = np.zeros((mdp.numStates,mdp.numActions))
	N_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates), dtype=np.int)
	N_s_a = np.zeros((mdp.numStates,mdp.numActions), dtype=np.int)
	P_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	Qupper = mdp.Vmax*np.ones((numPolicies, mdp.numStates))
	QupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates))
	Qlower = np.zeros((numPolicies, mdp.numStates))
	Qstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates))
	QstarMBAE = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates))
	QlowerMBAE = np.zeros((numPolicies, mdp.numStates))
	P_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates))
	P_lower_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates))
	VlowerMBAE = np.zeros((numPolicies, mdp.numStates))
	VupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates))
	Vstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates))
	discovered_states = set([start_state])
	deltadeltaV = np.zeros((mdp.numStates))

	#sampling all state action pairs
	while it < initial_iterations:
		for state in range(mdp.numStates):
			for act in range(mdp.numActions):
				it = it + 1
				ss, rr = mdp.simulate(state, act)
				R_s_a[state][act] = (rr + R_s_a[state][act]*N_s_a[state][act])/(N_s_a[state][act]+1)
				N_s_a[state][act] = N_s_a[state][act] + 1
				N_s_a_sprime[state][act][ss] = N_s_a_sprime[state][act][ss] + 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]
	samples += initial_iterations

	if(use_ddv):
		ff = open(mdp.filename+'-policyddv' + str(randomseed) +'.txt', 'w')
	else:
		ff = open(mdp.filename+'-policy' + str(randomseed) +'.txt', 'w')
	while samples<MAX_ITERATION_LIMIT:
		
		# print counts
		if(policyMethod == 0):
			for p in range(numPolicies):
				# print "Policy Number : ", p
				current_policy = policies[p]

				for i in range(mdp.numStates):
					# print "For state ", i, " doing UpperP"
					if(N_s_a[i][current_policy[i]]>0):
						P_tilda[p][i] = UpperP(
							i,
							current_policy[i],
							delta,
							N_s_a_sprime[i][current_policy[i]],
							mdp.numStates,
							Qupper[p],
							False
							)
						P_lower_tilda[p][i] = LowerP(
							i,
							current_policy[i],
							delta,
							N_s_a_sprime[i][current_policy[i]],
							mdp.numStates,
							Qlower[p],
							False
							)

				# computing all three versions of Q given current knowledge of the transition and reward matrices

				Qupper[p] = itConvergencePolicy(
					Qupper[p],
					getRewards(R_s_a, current_policy),
					P_tilda[p],
					mdp.discountFactor,
					epsilon,
					converge_iterations,
					epsilon_convergence
					)
				Qlower[p] = itConvergencePolicy(
					Qlower[p],
					getRewards(R_s_a, current_policy),
					P_lower_tilda[p],
					mdp.discountFactor,
					epsilon,
					converge_iterations,
					epsilon_convergence
					)	
				Qstar[p] = itConvergencePolicy(
					Qstar[p],
					getRewards(R_s_a, current_policy),
					getProb(P_s_a_sprime, current_policy),
					mdp.discountFactor,
					epsilon,
					converge_iterations,
					epsilon_convergence
					)	

				# import pdb; pdb.set_trace()
				# print "mbie bounds calculated!"
				for internal in range(converge_iterations):
					
					oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state])
					for state in range(mdp.numStates):
						# for act in range(mdp.numActions):
						act = current_policy[state]
							# Calculations for QupperMBAE and QlowerMBAE
						firstterm = R_s_a[state][act]
						# print VupperMBAE[p]
						secondterm = mdp.discountFactor*np.sum(VupperMBAE[p]*(P_s_a_sprime[state][act]))
						lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p]*(P_s_a_sprime[state][act]))
						star_secondterm = mdp.discountFactor*np.sum(Vstar[p]*(P_s_a_sprime[state][act]))
						thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[state][act])
						QupperMBAE[p][state] = firstterm + secondterm + thirdterm
						QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm
						QstarMBAE[p][state] = firstterm + star_secondterm
						VupperMBAE[p][state] = QupperMBAE[p][state]
						VlowerMBAE[p][state] = QlowerMBAE[p][state]
						Vstar[p][state] = QstarMBAE[p][state]
					if(np.linalg.norm(oldQlowerMBAE-QlowerMBAE[p][start_state])<=epsilon_convergence):
						break
				# print VupperMBAE[p]

		
			# import pdb; pdb.set_trace()
			policy1Index = np.argmax(QstarMBAE[:,start_state])
			policy2choices = QupperMBAE[:,start_state].argsort()[::-1]
			if(policy2choices[0]==policy1Index):
				policy2Index = policy2choices[1]
			else:
				policy2Index = policy2choices[0]
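			# LUCB-style pair: policy1 is the empirical best by Q*, policy2 is the
			# other policy with the highest upper bound at start_state.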

			# print "polivyiniex", QstarMBAE[:,start_state]
		#action switching policy iteration
		# elif(policyMethod==1):
		# 	# print "Choosing 2nd method for finding policy"
		# 	p = np.random.randint(0,numPolicies)
		# 	current_policy = policies[p]
		# 	while True:
		# 		for internal in range(converge_iterations):
		# 			oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state])
		# 			for state in range(mdp.numStates):
		# 				# for act in range(mdp.numActions):
		# 				act = policies[p][state]
		# 					# Calculations for QupperMBAE and QlowerMBAE
		# 				firstterm = R_s_a[state][act]
		# 				secondterm = mdp.discountFactor*np.sum(VupperMBAE[p]*(P_s_a_sprime[state][act]))
		# 				lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p]*(P_s_a_sprime[state][act]))
		# 				star_secondterm = mdp.discountFactor*np.sum(Vstar[p]*(P_s_a_sprime[state][act]))
		# 				thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[state][act])
		# 				QupperMBAE[p][state] = firstterm + secondterm + thirdterm
		# 				QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm
		# 				QstarMBAE[p][state] = firstterm + star_secondterm
		# 				VupperMBAE[p][state] = QupperMBAE[p][state]
		# 				VlowerMBAE[p][state] = QlowerMBAE[p][state]
		# 				Vstar[p][state] = QstarMBAE[p][state]
		# 			if(np.linalg.norm(oldQlowerMBAE-QlowerMBAE[p][start_state])<=epsilon_convergence):
		# 				break

		# 		hasChanged = False

		# 		for st in range(mdp.numStates):
		# 			for ac in range(mdp.numActions):
		# 				if(current_policy[st]==ac):
		# 					continue
		# 				else:	
		# 					tempfirstterm = R_s_a[st][ac]
		# 					tempsecondterm = mdp.discountFactor*np.sum(VupperMBAE[p]*(P_s_a_sprime[st][ac]))
		# 					lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p]*(P_s_a_sprime[st][ac]))
		# 					star_secondterm = mdp.discountFactor*np.sum(Vstar[p]*(P_s_a_sprime[st][ac]))
		# 					tempthirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[st][ac])
		# 					tempQupperMBAE = tempfirstterm + tempsecondterm + tempthirdterm
		# 					tempQlowerMBAE = tempfirstterm + lower_secondterm - tempthirdterm
		# 					tempQstarMBAE = tempfirstterm + star_secondterm
		# 					tempVupperMBAE = tempQupperMBAE
		# 					tempVlowerMBAE = tempQlowerMBAE
		# 					tempVstar = tempQstarMBAE

		# 				if(tempVstar>Vstar[p][st]):
		# 				# if(tempVupperMBAE>VupperMBAE[p][st]):
		# 					current_policy[st] = ac
		# 					hasChanged = True
		# 					break

		# 			# if(hasChanged):
		# 			# 	break


		# 		if hasChanged:
		# 			p = indexOfPolicy(current_policy,mdp.numStates,mdp.numActions)
		# 			print "Changing to ",current_policy, p
		# 		else:
		# 			policy1Index = p
		# 			# print "Found first best policy!",policy1Index
		# 			break

		# 	p = np.random.randint(0,numPolicies)
		# 	current_policy = policies[p]
		# 	while True:
		# 		for internal in range(converge_iterations):
		# 			oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state])
		# 			for state in range(mdp.numStates):
		# 				# for act in range(mdp.numActions):
		# 				act = policies[p][state]
		# 					# Calculations for QupperMBAE and QlowerMBAE
		# 				firstterm = R_s_a[state][act]
		# 				secondterm = mdp.discountFactor*np.sum(VupperMBAE[p]*(P_s_a_sprime[state][act]))
		# 				lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p]*(P_s_a_sprime[state][act]))
		# 				star_secondterm = mdp.discountFactor*np.sum(Vstar[p]*(P_s_a_sprime[state][act]))
		# 				thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[state][act])
		# 				QupperMBAE[p][state] = firstterm + secondterm + thirdterm
		# 				QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm
		# 				QstarMBAE[p][state] = firstterm + star_secondterm
		# 				VupperMBAE[p][state] = QupperMBAE[p][state]
		# 				VlowerMBAE[p][state] = QlowerMBAE[p][state]
		# 				Vstar[p][state] = QstarMBAE[p][state]
		# 			if(np.linalg.norm(oldQlowerMBAE-QlowerMBAE[p][start_state])<=epsilon_convergence):
		# 				break

		# 		hasChanged = False

		# 		for st in range(mdp.numStates):
		# 			for ac in range(mdp.numActions):
		# 				if(current_policy[st]==ac):
		# 					continue
		# 				else:	
		# 					tempfirstterm = R_s_a[st][ac]
		# 					tempsecondterm = mdp.discountFactor*np.sum(VupperMBAE[p]*(P_s_a_sprime[st][ac]))
		# 					lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p]*(P_s_a_sprime[st][ac]))
		# 					star_secondterm = mdp.discountFactor*np.sum(Vstar[p]*(P_s_a_sprime[st][ac]))
		# 					tempthirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[st][ac])
		# 					tempQupperMBAE = tempfirstterm + tempsecondterm + tempthirdterm
		# 					tempQlowerMBAE = tempfirstterm + lower_secondterm - tempthirdterm
		# 					tempQstarMBAE = tempfirstterm + star_secondterm
		# 					tempVupperMBAE = tempQupperMBAE
		# 					tempVlowerMBAE = tempQlowerMBAE
		# 					tempVstar = tempQstarMBAE

		# 				if(tempVupperMBAE>VupperMBAE[p][st]):
		# 				# if(tempVupperMBAE>VupperMBAE[p][st]):
		# 					current_policy[st] = ac
		# 					hasChanged = True
		# 					break

		# 			# if(hasChanged):
		# 			# 	break


		# 		if hasChanged:
		# 			p = indexOfPolicy(current_policy,mdp.numStates,mdp.numActions)
		# 			print "Changing to ",current_policy, p, "Vupper"
		# 		else:
		# 			policy3Index = p
		# 			# print "Found first best policy!",policy1Index
		# 			break


		# 	### Hill Climbing for second policy
		# 	# print "Finding 2nd best policy"
		# 	# print VupperMBAE[:,start_state]
		# 	oneNeighbours = allOneNeighbours(policies[policy3Index], mdp.numActions)

		# 	maxVupper = -float("inf")
		# 	bestPolicyIndex = -1
		# 	hasChanged = False
		# 	for p1 in oneNeighbours:
		# 		# print p1
		# 		# print indexOfPolicy(p1,mdp.numStates,mdp.numActions)
		# 		p1Index = indexOfPolicy(p1,mdp.numStates,mdp.numActions)

		# 		for internal in range(converge_iterations):
		# 			oldQlowerMBAE = np.copy(QlowerMBAE[p1Index][start_state])
		# 			for state in range(mdp.numStates):
		# 				# for act in range(mdp.numActions):
		# 				act = p1[state]
		# 					# Calculations for QupperMBAE and QlowerMBAE
		# 				firstterm = R_s_a[state][act]
		# 				secondterm = mdp.discountFactor*np.sum(VupperMBAE[p1Index]*(P_s_a_sprime[state][act]))
		# 				lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p1Index]*(P_s_a_sprime[state][act]))
		# 				star_secondterm = mdp.discountFactor*np.sum(Vstar[p1Index]*(P_s_a_sprime[state][act]))
		# 				thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[state][act])
		# 				QupperMBAE[p1Index][state] = firstterm + secondterm + thirdterm
		# 				QlowerMBAE[p1Index][state] = firstterm + lower_secondterm - thirdterm
		# 				QstarMBAE[p1Index][state] = firstterm + star_secondterm
		# 				VupperMBAE[p1Index][state] = QupperMBAE[p1Index][state]
		# 				VlowerMBAE[p1Index][state] = QlowerMBAE[p1Index][state]
		# 				Vstar[p1Index][state] = QstarMBAE[p1Index][state]
		# 			if(np.linalg.norm(oldQlowerMBAE-QlowerMBAE[p1Index][start_state])<=epsilon_convergence):
		# 				break

		# 			if(VupperMBAE[p1Index][start_state]>maxVupper):
		# 				bestPolicyIndex = p1Index
		# 				maxVupper = VupperMBAE[p1Index][start_state]
		# 				# print Vstar[0]
		# 				hasChanged = True

		# 	if hasChanged:
		# 		p = bestPolicyIndex
		# 		policy2Index = bestPolicyIndex
				# print "Second best policy ", policy2Index
		#policy iteration methods end

		h=0
		policy1 = policies[policy1Index]
		policy2 = policies[policy2Index]
		# print QlowerMBAE
		# print policy2
		# print QstarMBAE[:,start_state]
		state = start_state
		if (samples%1000)<100:
			if(verbose==0):
				# print QupperMBAE[:,start_state]
				# print Qstar[:,start_state]
				ff.write(str(samples))
				ff.write('\t')
				if(plot_vstar):
					# ff.write(str(Vstar[policy1Index][start_state]))
					ff.write(str(evaluatePolicy(mdp, policy1, start_state)))
					print(evaluatePolicy(mdp, policy1, start_state))
					print(policy1, policy2)
				else:
					ff.write(str(QupperMBAE[policy2Index][start_state]-QlowerMBAE[policy1Index][start_state]))#-epsilon*(1-mdp.discountFactor)/2 

				print(samples, QupperMBAE[policy2Index][start_state]-QlowerMBAE[policy1Index][start_state])
				ff.write('\n')
			else:
				print(samples)
				print(QupperMBAE[:,start_state], QlowerMBAE[:,start_state])
			# np.savetxt(ff, (policies[policy1Index]), fmt="%d")
		counts[policy1Index] += 1
		counts[policy2Index] += 1

		polList = [policy1Index, policy2Index]


		if(use_ddv):
			## Calculate V for all states
			for pnum in polList:
				policiesfddv = policies[pnum]
				# print "Getting DDV values"
				for st in list(discovered_states):
					ac = policiesfddv[st]
					#### Compute del del V
					deltadeltaV[st] = CalculateDelDelV(
						st,
						ac,
						mdp,
						N_s_a_sprime,
						QupperMBAE[pnum],
						QlowerMBAE[pnum],
						None,
						None,
						start_state,
						P_s_a_sprime,
						P_tilda[pnum],
						P_lower_tilda[pnum],
						R_s_a,
						epsilon,
						delta,
						converge_iterations,
						epsilon_convergence,
						policiesfddv
						)

				# print deltadeltaV
				cs = np.argmax(deltadeltaV)
				ca = policiesfddv[cs]
				# print deltadeltaV, cs, ca
				# print deltadeltaV, policy1, policy2
				# print "Found max state for DDV: ",cs,ca
				# time.sleep(0.1)
				ss, rr = mdp.simulate(cs, ca)
				print("Policy is ", policiesfddv)
				print("Sampling ", cs, ca)

				time.sleep(0.1)	
				samples = samples +  1
				discovered_states.add(ss)
				R_s_a[cs][ca] = (rr + R_s_a[cs][ca]*N_s_a[cs][ca])/(N_s_a[cs][ca]+1)
				N_s_a[cs][ca] += 1
				N_s_a_sprime[cs][ca][ss] += 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[cs][ca][s2] = (float)(N_s_a_sprime[cs][ca][s2])/N_s_a[cs][ca]

		elif(mc):

			deltaW = np.zeros(mdp.numStates)
			mu = np.zeros(mdp.numStates)
			D = np.zeros(mdp.numStates)
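			# mu tracks the believed state distribution under policy1 starting from
			# start_state; D accumulates its discounted occupancy over horizon H.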

			mu[start_state] = 1

			for t in range(H):
				D = D + (mdp.discountFactor**t) * mu
				mu = prob_step(mu, P_s_a_sprime, policy1)

			for st in range(mdp.numStates):
				#transition uncertainty for given s, pi(s)
				deltaW[st] = delW(st, policy1[st], delta, N_s_a_sprime[st][policy1[st]], mdp.numStates, False)
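			# Greedy choice: sample where transition uncertainty weighted by the
			# discounted occupancy, deltaW * D, is largest under policy1.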

			st = np.argmax(deltaW * D)
			ac = policy1[st]

			ss, rr = mdp.simulate(st, ac)
			samples += 1

			R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
			N_s_a[st][ac] += 1
			N_s_a_sprime[st][ac][ss] += 1
			# P_s_a_sprime = np.copy(N_s_a_sprime)
			for s2 in range(mdp.numStates):
				P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]


			deltaW = np.zeros(mdp.numStates)
			mu = np.zeros(mdp.numStates)
			D = np.zeros(mdp.numStates)

			mu[start_state] = 1

			for t in range(H):
				D = D + (mdp.discountFactor**t) * mu
				mu = prob_step(mu, P_s_a_sprime, policy2)

			for st in range(mdp.numStates):
				#transition uncertainty for given s, pi(s)
				deltaW[st] = delW(st, policy2[st], delta, N_s_a_sprime[st][policy2[st]], mdp.numStates, False)

			st = np.argmax(deltaW * D)
			ac = policy2[st]

			ss, rr = mdp.simulate(st, ac)
			samples += 1

			R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1)
			N_s_a[st][ac] += 1
			N_s_a_sprime[st][ac][ss] += 1
			# P_s_a_sprime = np.copy(N_s_a_sprime)
			for s2 in range(mdp.numStates):
				P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac]

		else:
			while h<H:
				act = policy1[state]
				# print "------>",current_state, current_action
				ss, rr = mdp.simulate(state, act)
				samples+=1
				R_s_a[state][act] = (rr + R_s_a[state][act]*N_s_a[state][act])/(N_s_a[state][act]+1)
				N_s_a[state][act] += 1
				N_s_a_sprime[state][act][ss] += 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]
				state = ss
				h+=1

			h=0

			state = start_state
			# print "episode : "
			while h<H:
				# print state, 
				act = policy2[state]
				ss, rr = mdp.simulate(state, act)
				samples+=1
				R_s_a[state][act] = (rr + R_s_a[state][act]*N_s_a[state][act])/(N_s_a[state][act]+1)
				N_s_a[state][act] += 1
				N_s_a_sprime[state][act][ss] += 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]
				state = ss
				h+=1

		if (samples%1000)<1000:
			if(QupperMBAE[policy2Index][start_state]-QlowerMBAE[policy1Index][start_state]-epsilon*(1-mdp.discountFactor)/2<0):
				print(Qupper[policy2Index][start_state],Qstar[policy1Index][start_state],epsilon*(1-mdp.discountFactor)/2)
				print("Epsilon condition reached at ",samples, " samples")
				print(policy1)
				return(policy1)
			else:
				# print QupperMBAE[policy2Index][start_state],QstarMBAE[policy1Index][start_state],epsilon*(1-mdp.discountFactor)/2
				pass
			# print "ends here"

	ff.close()
	return policy1
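	# Hypothetical usage sketch (the MDP class/constructor and the module-level
	# globals such as MAX_ITERATION_LIMIT, verbose, plot_vstar, policyMethod and the
	# helper functions are assumed to be defined elsewhere in this module):
	#   mdp = MDP('some-mdp-file.txt')   # hypothetical constructor
	#   policy = policyIt(mdp, start_state=0, epsilon=1.0, randomseed=0, mc=True)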
Example #7
0
def RoundRobin(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):
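    # Round-robin baseline: each outer pass samples every (state, action) pair once,
    # recomputes MBIE bounds (UpperP/LowerP + iteratedConvergence) and MBAE bounds,
    # and returns once the bound gap at start_state falls below epsilon*(1-gamma)/2.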
    global MAX_ITERATION_LIMIT, c
    if (randomseed is not None):
        np.random.seed(randomseed)
    iteration = 0
    it = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones((mdp.numStates), dtype=np.int)
    states_to_sample = range(mdp.numStates)
    colliding_values = np.zeros((mdp.numStates))
    is_converged = 0

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = (float)(
                        N_s_a_sprime[state][act]
                        [s2]) / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates thus far
    for state in range(mdp.numStates):
        for act in range(mdp.numActions):
            # Calculations for QupperMBAE and QlowerMBAE
            firstterm = np.sum(rewards_s_a_sprime[state]
                               [act]) / sampled_frequency_s_a[state][act]
            secondterm = mdp.discountFactor * np.sum(
                VupperMBAE *
                (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
            lower_secondterm = mdp.discountFactor * np.sum(
                VlowerMBAE *
                (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
            thirdterm = mdp.Vmax * math.sqrt(
                (math.log(c * mdp.numStates * mdp.numActions) -
                 math.log(delta)) / sampled_frequency_s_a[state][act])
            #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
            QupperMBAE[state][act] = firstterm + secondterm + thirdterm
            QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm

        VupperMBAE[state] = np.amax(QupperMBAE[state])
        VlowerMBAE[state] = np.amax(QlowerMBAE[state])

    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)

    if (verbose == 0):
        outp = open(mdp.filename + '-rr' + str(randomseed) + '.txt', 'w')
    ff = open(mdp.filename + '-rr-samples.txt', 'w+')

    while iteration < MAX_ITERATION_LIMIT:
        # print "Sampling state ", max_collision_state[0]
        # print colliding_values
        for state1 in range(mdp.numStates):
            # print "Sampling ", state1, "for this round"
            for act1 in range(mdp.numActions):
                iteration += 1
                sampled_frequency_s_a[state1][act1] += 1

                # Simulate the MDP with this state, action and update counts
                #### Trying a fixed number of continuous simulations (currently 1)
                for t in range(1):
                    s_prime, r = mdp.simulate(state1, act1)
                    rewards_s_a_sprime[state1][act1][s_prime] += r
                    R_s_a[state1][act1] = (
                        r + R_s_a[state1][act1] *
                        sampled_frequency_s_a[state1][act1]
                    ) / (sampled_frequency_s_a[state1][act1] + 1)
                    N_s_a_sprime[state1][act1][s_prime] += 1
                    for s2 in range(mdp.numStates):
                        P[state1][act1][s2] = (float)(
                            N_s_a_sprime[state1][act1]
                            [s2]) / sampled_frequency_s_a[state1][act1]

                ## Calculating Q and V values
                for i in range(mdp.numStates):
                    for j in range(mdp.numActions):
                        if (sampled_frequency_s_a[i][j] > 0):
                            P_tilda[i][j] = UpperP(i, j, delta,
                                                   N_s_a_sprime[i][j],
                                                   mdp.numStates, Vupper,
                                                   False)
                            P_lower_tilda[i][j] = LowerP(
                                i, j, delta, N_s_a_sprime[i][j], mdp.numStates,
                                Vlower, False)

                Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                                     mdp.discountFactor,
                                                     epsilon,
                                                     converge_iterations,
                                                     epsilon_convergence)
                Qlower, Vlower = iteratedConvergence(
                    Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon,
                    converge_iterations, epsilon_convergence)

                # Calculations for QupperMBAE and QlowerMBAE
                #### This involves a double for-loop iterated to convergence
                for state in range(mdp.numStates):
                    for act in range(mdp.numActions):
                        # Calculations for QupperMBAE and QlowerMBAE
                        firstterm = np.sum(
                            rewards_s_a_sprime[state]
                            [act]) / sampled_frequency_s_a[state][act]
                        secondterm = mdp.discountFactor * np.sum(
                            VupperMBAE * (N_s_a_sprime[state][act] /
                                          sampled_frequency_s_a[state][act]))
                        #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                        lower_secondterm = mdp.discountFactor * np.sum(
                            VlowerMBAE * (N_s_a_sprime[state][act] /
                                          sampled_frequency_s_a[state][act]))
                        star_secondterm = mdp.discountFactor * np.sum(
                            Vstar * (N_s_a_sprime[state][act] /
                                     sampled_frequency_s_a[state][act]))
                        #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                        thirdterm = mdp.Vmax * math.sqrt(
                            (math.log(c * (iteration**2) * mdp.numStates *
                                      mdp.numActions) - math.log(delta)) /
                            sampled_frequency_s_a[state][act])
                        #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                        QupperMBAE[state][
                            act] = firstterm + secondterm + thirdterm
                        QlowerMBAE[state][
                            act] = firstterm + lower_secondterm - thirdterm
                        Qstar[state][act] = firstterm + star_secondterm
                        # Calculation for Vstar
                        # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                        # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                    VupperMBAE[state] = np.amax(QupperMBAE[state])
                    VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                    Vstar[state] = np.amax(Qstar[state])

        count = 0
        # print iteration, (QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]])/epsilon, sampled_frequency_s_a
        if (iteration % 100 == 0):
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE,
                                                     QupperMBAE, Qstar)[0]
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            if (verbose == 0):
                outp.write(str(iteration))
                outp.write('\t')
                outp.write(
                    str(QupperMBAE[start_state][acList[1]] -
                        QlowerMBAE[start_state][acList[0]])
                )  #-epsilon*(1-mdp.discountFactor)/2
                # print(str(QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]]))
                # print(iteration, QupperMBAE[start_state])
                # outp.write(str(evaluatePolicy(mdp, final_policy, start_state)))
                print(str(evaluatePolicy(mdp, final_policy, start_state)))
                outp.write('\n')
            else:
                print(iteration, (QupperMBAE[start_state][acList[1]] -
                                  QlowerMBAE[start_state][acList[0]]))
                # print iteration, (Qupper[start_state][acList[1]]-Qlower[start_state][acList[0]])

            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')
            # print iteration

        #### Check epsilon condition for only starting state
        acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
        if (QupperMBAE[start_state][acList[1]] -
                QlowerMBAE[start_state][acList[0]] < epsilon *
            (1 - mdp.discountFactor) / 2 and iteration > 50):
            print(
                QupperMBAE[start_state][acList[1]] -
                QlowerMBAE[start_state][acList[0]], "<",
                epsilon * (1 - mdp.discountFactor) / 2)
            # if(count==mdp.numStates):
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                    Qstar)
            a = open('final' + mdp.filename + '-rr.txt', 'a+')
            a.write(str(iteration) + '\n')
            a.close()
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            print("Returning the policy :", final_policy)
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE,
                                                     QupperMBAE, Qstar)[0]
            return final_policy

    for i in range(mdp.numStates):
        if (final_policy[i] == -1):
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE,
                                             Qstar)[0]
    return final_policy
Example #8
0
def LUCBBound(mdp, start_state=0, epsilon=4, delta=0.1, fileprint=1):
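    # LUCB-style trajectory sampling: restart from start_state every H steps, at each
    # visited state sample one of the two most contentious actions, update MBIE and
    # MBAE bounds, and stop once start_state leaves the set of "colliding" states,
    # i.e. its bound gap drops below epsilon*(1-gamma)/2.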
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) /
            (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.ones((mdp.numStates))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    final_policy = (-1) * np.ones((mdp.numStates), dtype=np.int)
    states_to_sample = range(mdp.numStates)
    colliding_values = np.zeros((mdp.numStates))
    converge_iterations = 10000
    epsilon_convergence = 1e-4
    is_converged = 0
    print("Vmax", mdp.Vmax)

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    ### Calculating V, Q estimates thus far MBAE
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                # Calculations for QupperMBAE and QlowerMBAE
                firstterm = np.sum(rewards_s_a_sprime[state]
                                   [act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(
                    VupperMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                lower_secondterm = mdp.discountFactor * np.sum(
                    VlowerMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                thirdterm = mdp.Vmax * math.sqrt(
                    (math.log(c * mdp.numStates * mdp.numActions) -
                     math.log(delta)) / sampled_frequency_s_a[state][act])
                #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][
                    act] = firstterm + lower_secondterm - thirdterm

                # Calculation for Vstar
                # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])

                # if(state==start_state and abs(VupperMBAE[state]-QupperMBAEmax)<epsilon_convergence):
                # 	VupperMBAE[state] = QupperMBAEmax
                # 	print "Stopping with ", internal, "initial internal iterations"
                # 	is_converged = 1
                # 	break
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])

        if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                epsilon_convergence):
            print("Stopping with ", internal, "initial internal iterations")
            break

    if internal == converge_iterations - 1:
        print("Used all iterations")

    print("Initial estimate of QupperMBAE found! Now sampling")

    sys.stdout = open(mdp.filename + '-lucbbound.txt', 'w+')
    ff = open(mdp.filename + '-lucbbound-samples.txt', 'w+')

    h = 0
    state1 = start_state

    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [
            sorted(states_to_sample,
                   key=lambda x: colliding_values[x],
                   reverse=True)[0]
        ]
        # print "Sampling state ", max_collision_state[0]
        # print colliding_values
        # print "Sampling ", state1, "for this round"
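        # Episodic restart: return to start_state every H steps, otherwise continue
        # the trajectory from the previously sampled next state.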
        if (h % H == 0):
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
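        # LUCB action choice: pick uniformly at random between the empirical best
        # action and its closest competitor at the current state.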
        actionsList = bestTwoActions(mdp, state1, Qlower, Qupper, Qstar)
        a = np.random.choice(actionsList)
        iteration += 1
        sampled_frequency_s_a[state1][a] += 1
        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            R_s_a[state1][a] = (
                r + R_s_a[state1][a] * sampled_frequency_s_a[state1][a]) / (
                    sampled_frequency_s_a[state1][a] + 1)
            N_s_a_sprime[state1][a][s_prime] += 1

        ## Calculating Q and V values
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vlower, False)

        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)

        # Calculations for QupperMBAE and QlowerMBAE
        #### This involves a double for-loop iterated to convergence
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]
                                       ) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] /
                                 sampled_frequency_s_a[state][act]))
                    #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (iteration**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        sampled_frequency_s_a[state][act])
                    #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                    # Calculation for Vstar
                    # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                    # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                # print "Stopping with ", internal, "iterations"
                break

        count = 0
        if (iteration % 10000 == 0):
            print(iteration, (QupperMBAE[start_state][acList[1]] -
                              QlowerMBAE[start_state][acList[0]]) / epsilon)
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')
            # print QupperMBAE
            # print iteration

        #### Check epsilon condition for all the states
        # for st in range(mdp.numStates):
        # 	acList = bestTwoActions(mdp, st, Qstar, QupperMBAE, Qstar)
        # 	# print "Comparing ",QupperMBAE[st][acList[1]], QlowerMBAE[st][acList[0]]

        # 	if(QupperMBAE[st][acList[1]]-QlowerMBAE[st][acList[0]]<=epsilon):
        # 		# print "Setting action ", acList[0], "for state ", st
        # 		final_policy[st]=acList[0]
        # 		count+=1

        ##### Updating the list of colliding states
        states_to_sample = []
        for st in range(mdp.numStates):
            acList = bestTwoActions(mdp, st, Qlower, Qupper, Qstar)
            # colliding_values[st] = QupperMBAE[st][acList[1]]-QlowerMBAE[st][acList[0]]-epsilon
            ##### Changing stopping condition to epsilon*(1-gamma)/2
            colliding_values[st] = Qupper[st][acList[1]] - Qlower[st][
                acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
            # print colliding_values[st]
            if (colliding_values[st] > 0):
                ### this state is still colliding, add to sample states
                states_to_sample.append(st)

        #### Check epsilon condition for only starting state
        if (not (start_state in states_to_sample)):
            # if(count==mdp.numStates):
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            print("Returning the policy :", final_policy)
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper,
                                                     Qstar)[0]
            return final_policy

        h += 1

    for i in range(mdp.numStates):
        if (final_policy[i] == -1):
            final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper, Qstar)[0]
    return final_policy
Example #9
0
def markovchain(mdp,
                start_state=0,
                epsilon=4,
                randomseed=None,
                algo="episodic",
                delta=0.1,
                bounds="MBAE"):
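    # Fixed-policy evaluation: estimates upper/lower value bounds for the module-level
    # fixedPolicy, gathering samples according to `algo` (DDV, episodic rollouts,
    # uniform sweeps, greedy MBAE/MBIE gap, or the occupancy-weighted "mybest" rule).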

    if (randomseed is not None):
        np.random.seed(randomseed)
    policies = np.array(getPolicies(mdp.numStates, mdp.numActions))
    numPolicies = len(policies)
    print(numPolicies)
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) /
            (1 - mdp.discountFactor))

    print("Chosen value of H is : ", H)

    ## Initializations
    it = 0
    samples = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates),
                            dtype=np.int)
    N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=np.int)
    P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    Qupper = mdp.Vmax * np.ones((numPolicies, mdp.numStates))
    QupperMBAE = mdp.Vmax * np.ones((numPolicies, mdp.numStates))
    Qlower = np.zeros((numPolicies, mdp.numStates))
    Qstar = (mdp.Vmax / 2) * np.ones((numPolicies, mdp.numStates))
    QstarMBAE = (mdp.Vmax / 2) * np.ones((numPolicies, mdp.numStates))
    QlowerMBAE = np.zeros((numPolicies, mdp.numStates))
    P_tilda = np.zeros((numPolicies, mdp.numStates, mdp.numStates))
    P_lower_tilda = np.zeros((numPolicies, mdp.numStates, mdp.numStates))
    VlowerMBAE = np.zeros((numPolicies, mdp.numStates))
    VupperMBAE = mdp.Vmax * np.ones((numPolicies, mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((numPolicies, mdp.numStates))
    discovered_states = set([start_state])
    deltadeltaV = np.zeros((mdp.numStates))
    state_dist = np.zeros((mdp.numStates))
    state_dist[start_state] = 1

    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it = it + 1
                ss, rr = mdp.simulate(state, act)
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]
                                     ) / (N_s_a[state][act] + 1)
                N_s_a[state][act] = N_s_a[state][act] + 1
                N_s_a_sprime[state][act][ss] = N_s_a_sprime[state][act][ss] + 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = (float)(
                        N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
    samples += initial_iterations

    if (algo == "use_ddv"):
        ff = open(mdp.filename + '-markovddv' + str(randomseed) + '.txt', 'w')
    elif (algo == "episodic"):
        ff = open(mdp.filename + '-markoveps' + str(randomseed) + '.txt', 'w')
    elif (algo == "uniform"):
        ff = open(mdp.filename + '-markovuni' + str(randomseed) + '.txt', 'w')
    elif (algo == "greedyMBAE"):
        ff = open(mdp.filename + '-markovMBAE' + str(randomseed) + '.txt', 'w')
    elif (algo == "greedyMBIE"):
        ff = open(mdp.filename + '-markovMBIE' + str(randomseed) + '.txt', 'w')
    elif (algo == "mybest"):
        ff = open(mdp.filename + '-markovbest' + str(randomseed) + '.txt', 'w')

    while samples < MAX_ITERATION_LIMIT:

        p = 0
        current_policy = fixedPolicy

        for i in range(mdp.numStates):
            # print "For state ", i, " doing UpperP"
            if (N_s_a[i][current_policy[i]] > 0):
                P_tilda[p][i] = UpperP(i, current_policy[i], delta,
                                       N_s_a_sprime[i][current_policy[i]],
                                       mdp.numStates, Qupper[p], False)
                P_lower_tilda[p][i] = LowerP(
                    i, current_policy[i], delta,
                    N_s_a_sprime[i][current_policy[i]], mdp.numStates,
                    Qlower[p], False)

        Qupper[p] = itConvergencePolicy(Qupper[p],
                                        getRewards(R_s_a, current_policy),
                                        P_tilda[p], mdp.discountFactor,
                                        epsilon, converge_iterations,
                                        epsilon_convergence)
        Qlower[p] = itConvergencePolicy(Qlower[p],
                                        getRewards(R_s_a, current_policy),
                                        P_lower_tilda[p], mdp.discountFactor,
                                        epsilon, converge_iterations,
                                        epsilon_convergence)
        Qstar[p] = itConvergencePolicy(Qstar[p],
                                       getRewards(R_s_a, current_policy),
                                       getProb(P_s_a_sprime, current_policy),
                                       mdp.discountFactor, epsilon,
                                       converge_iterations,
                                       epsilon_convergence)

        for internal in range(converge_iterations):

            oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state])
            for state in range(mdp.numStates):
                act = current_policy[state]
                firstterm = R_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(
                    VupperMBAE[p] * (P_s_a_sprime[state][act]))
                lower_secondterm = mdp.discountFactor * np.sum(
                    VlowerMBAE[p] * (P_s_a_sprime[state][act]))
                star_secondterm = mdp.discountFactor * np.sum(
                    Vstar[p] * (P_s_a_sprime[state][act]))
                thirdterm = mdp.Vmax * math.sqrt(
                    (math.log(c * (samples**2) * mdp.numStates * 1) -
                     math.log(delta)) / N_s_a[state][act])
                QupperMBAE[p][state] = firstterm + secondterm + thirdterm
                QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm
                QstarMBAE[p][state] = firstterm + star_secondterm
                VupperMBAE[p][state] = QupperMBAE[p][state]
                VlowerMBAE[p][state] = QlowerMBAE[p][state]
                Vstar[p][state] = QstarMBAE[p][state]
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[p][start_state]) <=
                    epsilon_convergence):
                break
        policy1Index = 0

        h = 0
        policy1 = fixedPolicy
        state = start_state
        # print "samples", samples
        if (samples % 1000) < 100:
            if (verbose == 0):
                ff.write(str(samples))
                ff.write('\t')
                if (plot_vstar):
                    ff.write(str(Vstar[policy1Index][start_state]))
                else:
                    ff.write(
                        str(QupperMBAE[policy1Index][start_state] -
                            QlowerMBAE[policy1Index][start_state])
                    )  #-epsilon*(1-mdp.discountFactor)/2
                print(samples, QupperMBAE[policy1Index][start_state] -
                      QlowerMBAE[policy1Index][start_state])
                ff.write('\n')
            else:
                print(samples)
                print(QupperMBAE[:, start_state], QlowerMBAE[:, start_state])

        polList = [policy1Index]

        if (algo == "use_ddv"):
            ## Calculate V for all states
            for pnum in polList:
                policiesfddv = fixedPolicy
                # print "Getting DDV values"
                for st in list(discovered_states):
                    ac = policiesfddv[st]
                    #### Compute del del V
                    deltadeltaV[st] = CalculateDelDelV(
                        st, ac, mdp, N_s_a_sprime, QupperMBAE[pnum],
                        QlowerMBAE[pnum], None, None, start_state,
                        P_s_a_sprime, P_tilda[pnum], P_lower_tilda[pnum],
                        R_s_a, epsilon, delta, converge_iterations,
                        epsilon_convergence, policiesfddv)

                # print deltadeltaV
                cs = np.argmax(deltadeltaV)
                ca = policiesfddv[cs]
                # print deltadeltaV, cs, ca
                # print deltadeltaV, policy1, policy2
                # print "Found max state for DDV: ",cs,ca
                # time.sleep(0.1)
                ss, rr = mdp.simulate(cs, ca)
                # print "Policy is ", policiesfddv
                # print "Sampling ", cs, ca

                time.sleep(0.1)
                samples = samples + 1
                discovered_states.add(ss)
                R_s_a[cs][ca] = (rr + R_s_a[cs][ca] * N_s_a[cs][ca]) / (
                    N_s_a[cs][ca] + 1)
                N_s_a[cs][ca] += 1
                N_s_a_sprime[cs][ca][ss] += 1
                # P_s_a_sprime = np.copy(N_s_a_sprime)
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[cs][ca][s2] = (float)(
                        N_s_a_sprime[cs][ca][s2]) / N_s_a[cs][ca]
        elif (algo == "episodic"):
            while h < H:
                act = policy1[state]
                # print "------>",current_state, current_action
                ss, rr = mdp.simulate(state, act)
                # print "Sampling ", state, act
                samples += 1
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]
                                     ) / (N_s_a[state][act] + 1)
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                # P_s_a_sprime = np.copy(N_s_a_sprime)
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = (float)(
                        N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
                state = ss
                h += 1
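        # "uniform": draw one sample from every state under the fixed policy per round.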
        elif (algo == "uniform"):
            for st in range(mdp.numStates):
                ac = fixedPolicy[st]
                ss, rr = mdp.simulate(st, ac)
                # print "Sampling ", st, ac
                samples += 1
                R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (
                    N_s_a[st][ac] + 1)
                N_s_a[st][ac] += 1
                N_s_a_sprime[st][ac][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[st][ac][s2] = (float)(
                        N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
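        # "greedyMBAE": sample the single state with the widest MBAE interval
        # (VupperMBAE - VlowerMBAE).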
        elif (algo == "greedyMBAE"):

            st = max(range(mdp.numStates),
                     key=lambda x: VupperMBAE[0][x] - VlowerMBAE[0][x])
            ac = fixedPolicy[st]
            ss, rr = mdp.simulate(st, ac)
            # print "Sampling ", st, ac
            samples += 1
            R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (
                N_s_a[st][ac] + 1)
            N_s_a[st][ac] += 1
            N_s_a_sprime[st][ac][ss] += 1
            for s2 in range(mdp.numStates):
                P_s_a_sprime[st][ac][s2] = (float)(
                    N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
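        # "greedyMBIE": sample the single state with the widest MBIE interval
        # (Qupper - Qlower).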
        elif (algo == "greedyMBIE"):

            st = max(range(mdp.numStates),
                     key=lambda x: Qupper[0][x] - Qlower[0][x])
            ac = fixedPolicy[st]
            ss, rr = mdp.simulate(st, ac)
            # print "Sampling ", st, ac
            samples += 1
            R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (
                N_s_a[st][ac] + 1)
            N_s_a[st][ac] += 1
            N_s_a_sprime[st][ac][ss] += 1
            for s2 in range(mdp.numStates):
                P_s_a_sprime[st][ac][s2] = (float)(
                    N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
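        # "mybest": periodically reset the state-occupancy distribution to the start state,
        # draw a batch of states from it (batch size from getSampleCount), sample each under
        # the fixed policy, then push the distribution one step through the estimated model.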
        elif (algo == "mybest"):
            if (samples % 10000 < 50):
                state_dist = np.zeros((mdp.numStates))
                state_dist[start_state] = 1
            N = getSampleCount(state_dist, N_s_a_sprime,
                               QupperMBAE[policy1Index],
                               QlowerMBAE[policy1Index],
                               QstarMBAE[policy1Index])
            # print N
            for i in range(N):
                # print state_dist, samples, P_s_a_sprime
                # import pdb; pdb.set_trace()
                st = np.random.choice(np.arange(mdp.numStates), p=state_dist)
                ac = fixedPolicy[st]
                ss, rr = mdp.simulate(st, ac)
                # print "Sampling ", st, ac
                samples += 1
                R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (
                    N_s_a[st][ac] + 1)
                N_s_a[st][ac] += 1
                N_s_a_sprime[st][ac][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[st][ac][s2] = (float)(
                        N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
            state_dist = prob_step(state_dist, P_s_a_sprime, fixedPolicy)

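        # Stopping rule: once the MBAE interval at the start state is narrower than
        # epsilon * (1 - discountFactor) / 2, the fixed policy's value estimate is accepted
        # and the policy is returned.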
        if (samples % 1000) < 100:
            if (QupperMBAE[policy1Index][start_state] -
                    QlowerMBAE[policy1Index][start_state] -
                    epsilon * (1 - mdp.discountFactor) / 2 < 0):
                print(Qupper[policy1Index][start_state],
                      Qstar[policy1Index][start_state],
                      epsilon * (1 - mdp.discountFactor) / 2)
                print("Epsilon condition reached at", samples, "samples")
                return fixedPolicy
            else:
                # print QupperMBAE[policy2Index][start_state],QstarMBAE[policy1Index][start_state],epsilon*(1-mdp.discountFactor)/2
                pass
            # print "ends here"

    ff.close()
    return policy1