Example No. 1
def main():
#policy learning	
	global numEpisodes 
	global returnSum 
	global emu
	global epi
	global alpha
	global num_states
	global num_actions

	for i in range(6):	
		print "i:",i
		if i==0:alpha,emu,epi=0.001,0.05,0.01  
		if i==1:alpha,emu,epi=0.001,0.1,0.01
		if i==2:alpha,emu,epi=0.001,0.2,0.01
		if i==3:alpha,emu,epi=0.001,0.3,0.01
		if i==4:alpha,emu,epi=0.001,0.4,0.01
		if i==5:alpha,emu,epi=0.001,0.6,0.01
		print alpha,emu,epi
		for _ in range(numEpisodes):
			if _%10000==0:
				print "Episode:",_
				print "Average return:",returnSum/(_+1)
			s=bj.init()
			rand=np.random.random()
			if rand<emu:
		#		print "rand:",rand,"policy: rand"
				rand_policy(s)
			else:
				exp_sarsa(s)
		#		print "rand:",rand,"policy: ExpectedSarsa"
		
		bj.printPolicy(sarsa_policy)
		print "Average return:",returnSum/numEpisodes
		
		# deterministic policy
		returnSum=0.0
		numEpisodes = 10**7
		for _ in range(numEpisodes):
			if _%10000==0:
				print "Episode:",_
				print "Average return:",returnSum/(_+1)
			s=bj.init()
			deter_policy(s)
		bj.printPolicy(sarsa_policy)
		print "Average return:",returnSum/numEpisodes
		print "alpha, emu, epi, episodes:",alpha,emu,epi,numEpisodes
		info[i]=returnSum/numEpisodes,alpha,emu,epi,numEpisodes

	print info	
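main() above relies on helper functions that are not part of the snippet (rand_policy, exp_sarsa, deter_policy, sarsa_policy). A minimal sketch of the two evaluation-side helpers, assuming a NumPy action-value table Q of shape (num_states, num_actions), that bj.sample(s, a) returns (reward, next_state), and that returnSum is the same global accumulated above; these bodies are assumptions, not the original code:

import numpy as np

def sarsa_policy(s):
    # greedy action with respect to the learned action-value table Q (assumed)
    return np.argmax(Q[s])

def deter_policy(s):
    # run one episode following the greedy policy and accumulate its return (assumed)
    global returnSum
    while s != -1:  # -1 is the terminal state
        r, sp = bj.sample(s, sarsa_policy(s))
        returnSum += r
        s = sp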
Example No. 2
def showOneGame():
	s=blackjack.init()
	moves=[0,1,0]  # fixed action sequence for this demo game
	turn=0
	while s!=-1: #-1 is terminal
		a=moves[turn]
		r,sp=blackjack.sample(s,a)
		print("turn %d: s %d a %d -> r %d sp %d "%(turn,s,a,r,sp),end="")
		print("\t Player Sum: %d  Dealer Card: %d  Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
		s=sp
		turn+=1
	return None
Example No. 3
def runLearnedPolicy():
	G = 0
	# Init the game of blackjack and get the initial state
	s = blackjack.init()
	#Consider each step of the episode
	while s!=-1: #-1 is terminal
		# Take the action given by learned policy
		a = getLearnedPolicy(s)
		r,sp = blackjack.sample(s,a)
		G += r
		s=sp
	return G
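getLearnedPolicy is not included in the snippet. A plausible one-line version, assuming the learned action values live in a NumPy table Q indexed by state (a hypothetical name here); averaging runLearnedPolicy() over many episodes then estimates the value of that policy:

import numpy as np

def getLearnedPolicy(s):
    # hypothetical: act greedily with respect to the learned action-value table Q
    return int(np.argmax(Q[s]))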
Example No. 4
def showOneGame():
    G = 0
    s=blackjack.init()
    turn=0
    while s!=-1: #-1 is terminal
        a=randint(0,1)
        r,sp=blackjack.sample(s,a)
        print("turn %d: s %d a %d -> r %d sp %d "%(turn,s,a,r,sp),end="")
        print("\t Player Sum: %d  Dealer Card: %d  Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        turn+=1
        s=sp
        G=G+r
    return G
Example No. 5
def showOneGame():
    s=blackjack.init()
    moves=[0,1] 
    turn=0
    Reward_sum = 0
    while s!=-1: #-1 is terminal
        a= moves[turn]
        r,sp=blackjack.sample(s,a)
        #print("turn %d: s %d a %d -> r %d sp %d "%(turn,s,a,r,sp),end="")
        #print("\t Player Sum: %d  Dealer Card: %d  Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        s=sp
        turn=random.randint(0,1)  # pick the index of the next action at random
        Reward_sum +=r
    return Reward_sum
Example No. 6
def Qlearning(ex):
  
  # Initialise the state (deal the initial cards)
  s=blackjack.init()
  segma_r = 0
  while s!=-1:                                      # -1 is terminal
    a = argmax([ex[s,0], ex[s,1]])                 # Choose argmax(Q(s,a))   
    if random.uniform(0,1) < epsilon/2:            # e-greedy: flip to the other action with prob. epsilon/2
      a = abs(a-1)    
      
    # Q(s,a) <- Q(s,a) + alpha * (r + max_a' Q(sp,a') - Q(s,a))
    
    r,sp=blackjack.sample(s,a)                      # Get the reward and s'
    ex[s,a] += alpha * (r - ex[s,a] + ex[sp,argmax([ex[sp,0],ex[sp,1]])])  
    s=sp; segma_r += r                              # Advance to the next state and accumulate the reward
  return segma_r                                   
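Qlearning expects the action-value table as its argument and returns the return of a single episode. A minimal driver, assuming a NumPy table and the alpha/epsilon globals the function reads (the sizes and values below are assumptions, not from the original):

import numpy as np

alpha, epsilon = 0.001, 0.1             # assumed hyper-parameters read by Qlearning
ex = 0.00001 * np.random.rand(182, 2)   # assumed table size; row -1 is read when sp == -1
ex[-1] = 0.0                            # keep that terminal row at zero
returnSum = 0.0
numEpisodes = 100000
for episodeNum in range(numEpisodes):
    returnSum += Qlearning(ex)
print("Average return:", returnSum / numEpisodes)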
Example No. 7
        def qLearning(self):
          for i in range(1,181):
            randomValue1 = random.random()
            randomValue2 = random.random()
            
            randomValue1 = randomValue1 * 0.00001
            randomValue2 = randomValue2 * 0.00001
            self.q[i][0] = randomValue1
            self.q[i][1] = randomValue2
          
          iterations = 0
          returnSum = 0
          while iterations < self.MAXITERATIONS:      
            s = blackjack.init()
            reward, state = blackjack.sample(s,1)   # the first action taken is always action 1
            if state == -1:
              returnSum = returnSum+reward
            while state != -1:
              A = self.eGreedy(self.q,state)
              reward, statePrime = self.giveSample(state, A)
              returnSum = returnSum + reward
              if reward == 0 and statePrime != -1:
                theMaxAction = self.maxAction(self.q, statePrime)
                newStateMaxQSA = self.q[statePrime][theMaxAction]
              else:
                newStateMaxQSA = 0
              
              if self.ALPHA == "Dynamic":
                # print("YES")
                ALPHA = self.getDynamicAlpha(state, A)
              else:
                ALPHA = self.ALPHA

              bracketValue = reward+(self.GAMMA*newStateMaxQSA)-self.q[state][A]
              self.q[state][A] = self.q[state][A]+ALPHA*(bracketValue)  
              state = statePrime
            
            iterations = iterations + 1
            if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
                print("Average Return During Learning Phase at "+str(iterations)+" is "+str(returnSum/iterations))

          
          print("The Policy learned From the Exploration Phase is : ")
          blackjack.printPolicy(self.printPolicy2)
          return returnSum/self.MAXITERATIONS
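eGreedy, maxAction, giveSample and getDynamicAlpha are methods of the surrounding class that the snippet does not show (onlyExploitQ in Example No. 9 relies on the same maxAction and giveSample). A hedged sketch of three of them, assuming q maps each state to a [Q(s,0), Q(s,1)] pair; EPSILON is an assumed attribute name:

import random
import blackjack

class HelperSketch(object):
    """Hypothetical stand-ins for the helpers used by qLearning and onlyExploitQ."""
    EPSILON = 0.1  # assumed exploration rate

    def maxAction(self, q, state):
        # index of the larger of the two action values for this state
        return 0 if q[state][0] >= q[state][1] else 1

    def eGreedy(self, q, state):
        # with probability EPSILON take a random action, otherwise the greedy one
        if random.random() < self.EPSILON:
            return random.randint(0, 1)
        return self.maxAction(q, state)

    def giveSample(self, state, action):
        # thin wrapper around the environment's transition sampler
        return blackjack.sample(state, action)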
Example No. 8
def bjrandomPolicy(numEpisodes=10000):
    # Input: number of Episodes
    # Output: Average Return over number of episodes
    # Policy: Equally Random
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        # Implement an equally random policy, assuming gamma = 1
        s = blackjack.init()
        while s != -1:  # -1 is terminal
            # randint(0, 1) returns 0 or 1, each with probability 0.5
            a = random.randint(0, 1)
            r, sp = blackjack.sample(s, a)
            G += r  # accumulate the reward
            s = sp
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G

    print("Average return: ", returnSum/numEpisodes)
    return None
Example No. 9
        def onlyExploitQ(self):          
          iterations = 0
          returnSum = 0
          
          while iterations < self.MAXITERATIONS:
            s = blackjack.init()
            reward, state = blackjack.sample(s,1)
            if state == -1:
              returnSum = returnSum+reward
            while state != -1:
              A = self.maxAction(self.q, state)
              reward, statePrime = self.giveSample(state, A)
              returnSum = returnSum + reward

              state = statePrime
            iterations = iterations + 1
            if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
                print("Average Return During Exploitation Phase at "+str(iterations)+" is "+str(returnSum/iterations))
            
          return returnSum/self.MAXITERATIONS
Example No. 10
import blackjack
from pylab import *
from numpy import *

def random_policy (list_of_actions): #returns a random action from a list of possible actions
    next_action = choice(list_of_actions)
    #print next_action
    return next_action

numEpisodes = 10000

returnSum = 0.0
actions = [0,1]


for episodeNum in range(numEpisodes):
    s = blackjack.init()
    G = 0
    while (s != -1):
        a = random_policy(actions)
        result = blackjack.sample(s, a)
        #print blackjack.sample (0, 1)
        G = G + result[0]
        s = result[1]
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G

print "Average return: ", returnSum/numEpisodes


# printPolicy(Q)  # Q is never defined in this snippet, so this call would fail
Example No. 11
    if (state == -1):
        return 0
    elif (testnumber <= epsilonpi):
        return (0.5 * Q[state][0] + 0.5 * Q[state][1])
    else:
        return Q[state][np.argmax(Q[state])]


"""
Experiments:

First learn policy and calculate average return
"""

for episodeNum in range(numEpisodes):
    blackjack.init()
    state = 0
    return1 = 0
    while (state != -1):
        action = policy(state)
        reward, statep = blackjack.sample(state, action)
        Q[state][action] = Q[state][action] + alpha * (
            reward + expectedValue(statep) - Q[state][action])
        state = statep
        return1 += reward
    returnSum += return1
    if (((episodeNum % 10000) == 0) and (episodeNum != 0)):
        print "Count =", episodeNum, "Average return: ", returnSum / (
            episodeNum)

blackjack.printPolicy(learnedPolicy)
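learnedPolicy is not shown in the snippet. A one-line version consistent with the greedy branch of expectedValue above, using the same Q table (a presumed reconstruction):

import numpy as np

def learnedPolicy(state):
    # presumed: the greedy action under the learned action values
    return np.argmax(Q[state])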
Example No. 12
import blackjack
from pylab import *
from random import *

numEpisodes = 2000

returnSum = 0.0
for episodeNum in range(numEpisodes):
    G =0
    black = blackjack.init()
    action =[0,1]
    while black!=-1:
        num = randint(0,1)
        n,black = blackjack.sample(black,action[num])
        
        G+=n

    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example No. 13
numEpisodes = 1000000
e = 1
alpha = 0.001
returnSum = 0.0

Q = [[0.0, 0.0] for i in range(183)]  # one independent [Q(s,0), Q(s,1)] pair per state
#for i in Q:
 #   i[0],i[1]= uniform(0,1),uniform(0,1)

for episodeNum in range(numEpisodes):
    G = 0
    # my code starts here
    a=0
    
    S =blackjack.init()
    t = Q[S]
    if (e > randint(0, 2)):
        R,S_ =blackjack.sample(S,randint(0,2))
    else:
        if t[0]>t[1]:
            R,S_ = blackjack.sample(S,0)
            a=0
        else:
            R,S_ = blackjack.sample(S,1)
            a=1
    while (S_!=(-1)):
        Q[S][a] = Q[S][a] + alpha*(R + Q[S_][0]+Q[S_][1]-Q[S][a])
        S=S_
        t = Q[S]
        if (e > randint(0, 2)):
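            # (the snippet is cut off here; the rest of the loop body presumably mirrors
            #  the epsilon-greedy selection before the loop; hedged reconstruction below)
            R,S_ = blackjack.sample(S,randint(0,2))
        else:
            if t[0]>t[1]:
                R,S_ = blackjack.sample(S,0)
                a=0
            else:
                R,S_ = blackjack.sample(S,1)
                a=1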
Example No. 14
import blackjack
from pylab import *

numEpisodes = 2000

returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    # my code starts here
    
    
    # take a first random action from the initial state
    R,S = blackjack.sample(blackjack.init(),(randint(0, 2)))
    G = R
    while (S!=(-1)):
        R,S = blackjack.sample(S,(randint(0, 2)))
        G += R  # accumulate the reward from each step
    
            
    print "Episode: ", episodeNum, "Return: ", G 
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example No. 15
epsilonMu = 0.2
epsilonPi = 0.0
alpha = 0.0005
discount = 1
epsilon = epsilonMu


def returnPolicy(state):
    return n.argmax(states[state])


returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    #Start a new game of blackjack
    currentState = blackjack.init()
    #Continue this game until the terminal state is reached
    while (currentState != -1):
        # Get a random number between 0 and 1; if it's less than the behaviour epsilon, explore

        rnumber = n.random.rand()
        if rnumber < epsilon:
            action = n.random.randint(2)
        else:
            #If not exploring, pick the highest action at state S
            action = returnPolicy(currentState)

        # Sample the environment: get the reward and the next state
        next = blackjack.sample(currentState, action)
        reward = next[0]
        nextstate = next[1]
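Both this snippet and Example No. 18 below stop right after nextstate is extracted, before any learning update. A hedged sketch of how the loop presumably continues, assuming the states table and hyper-parameters defined in Example No. 18 and a greedy (Q-learning-style) bootstrap target:

        # hypothetical continuation, not part of the original snippet
        if nextstate == -1:
            target = reward                   # terminal state: no bootstrap term
        else:
            target = reward + discount * n.max(states[nextstate])
        states[currentState][action] += alpha * (target - states[currentState][action])
        G += reward
        currentState = nextstate
    returnSum += G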
Example No. 16
import blackjack
from pylab import *

numEpisodes = 2000

returnSum = 0.0
for episodeNum in range(numEpisodes):
	G = 0
	currentstate = blackjack.init()
	while(currentstate != -1):
		action = randint(2) #randomly pick the action
		next = blackjack.sample(currentstate, action)
		G = G + next[0]
		currentstate = next[1]
	print "Episode: ", episodeNum, "Return: ", G
	returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example No. 17
#Function to find the probability of a given action given the policy
def actionProb(e,a,s):
    if np.argmax(Q[s]) == a:
        return 1 - e + e/num_actions
    else:
        return e/num_actions


#Learning the policy through the Expected Sarsa algorithm
Q =  0.00001*np.random.rand(num_states,num_actions)

for episodeNum in range(numEpisodes):
    G = 0

    s = bj.init()
    while s != -1:
        a = np.random.choice(2, p=[actionProb(emu,0,s),actionProb(emu,1,s)])
        r, s1 = bj.sample(s,a)
        Q[s,a] = Q[s,a] + alpha*(r + actionProb(epi,0,s1)*Q[s1,0] + actionProb(epi,1,s1)*Q[s1,1] - Q[s,a])
        s = s1
        G+=r

    returnSum = returnSum + G

    if episodeNum%10000 == 0:
        print "Episode: ", episodeNum
        print "Average return: ", returnSum/(episodeNum+1)

#Function for the learned policy
def learnedPolicy(s):
    # presumed body: act greedily with respect to the learned action values
    return np.argmax(Q[s])
Example No. 18
numEpisodes = evaluationEpisodes + learningEpisodes

states = 0.00001*n.random.rand(181,2)
epsilonMu = 0.2
epsilonPi = 0.0
alpha = 0.0005
discount = 1
epsilon = epsilonMu

def returnPolicy(state): return n.argmax(states[state])

returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    #Start a new game of blackjack
    currentState = blackjack.init()
    #Continue this game until the terminal state is reached
    while(currentState != -1):
        # Get a random number between 0 and 1; if it's less than the behaviour epsilon, explore
       
        rnumber = n.random.rand()
        if rnumber < epsilon:
            action = n.random.randint(2)
        else:
            # If not exploring, pick the highest-valued action at state S
            action = returnPolicy(currentState)

        # Sample the environment: get the reward and the next state
        next = blackjack.sample(currentState, action)
        reward = next[0]
        nextstate = next[1]
Example No. 19
	if (state == -1):
		return 0
	elif (testnumber <= epsilonpi):
		return (0.5*Q[state][0] + 0.5*Q[state][1])
	else:
		return Q[state][np.argmax(Q[state])]


"""
Experiments:

First learn policy and calculate average return
"""

for episodeNum in range(numEpisodes):
	blackjack.init()
	state=0
	return1=0
	while (state != -1):
		action = policy(state)
		reward,statep=blackjack.sample(state,action) 
		Q[state][action] = Q[state][action] + alpha*(reward + expectedValue(statep) - Q[state][action])
		state = statep
		return1+=reward
	returnSum+=return1
	if (((episodeNum % 10000) == 0) and (episodeNum != 0)):
		print "Count =",episodeNum,"Average return: ", returnSum/(episodeNum)