def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        gamma = 1
        # Choose a random action - 1 or 0
        action = random.randint(0, 1)
        # initial state at the start of a new blackjack game
        state = blackjack.init()
        # Next state and reward from current state and action
        stateAction = blackjack.sample(state, action)
        reward = stateAction[0]
        state = stateAction[1]
        G = G + reward
        #print("reward1, newstate", reward, state)
        # go through every state in the game of blackjack
        while state != False:
            # Choose a random action - 1 or 0
            action = random.randint(0, 1)
            # Next state and reward from current state and action
            stateAction = blackjack.sample(state, action)
            reward = stateAction[0]
            state = stateAction[1]
            #print("reward, newstate", reward, state)
            G = G + reward
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes

#run(numEvaluationEpisodes)
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        s = blackjack.init()  # initial state
        # Choose an action with the greedy policy
        a = policy(s)
        # Next state and reward from current state and action
        stateAction = blackjack.sample(s, a)
        newState = stateAction[1]
        reward = stateAction[0]
        G = G + reward
        # go through every state in the game of blackjack
        while newState != False:
            a = policy(newState)
            # Next state and reward from current state and action
            stateAction = blackjack.sample(newState, a)
            reward = stateAction[0]
            newState = stateAction[1]
            G = G + reward
        # Use deterministic policy from Q1 and Q2 to run a number of
        # episodes without updates. Return average return of episodes.
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        S = blackjack.init()
        R, S = blackjack.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            prob1 = np.random.random()
            if prob1 < eps:
                # explore
                A = np.random.choice([0, 1])
            else:
                # greedy
                A = Q.argmax()
            R, S_prime = blackjack.sample(S, A)
            G += R
            S_prime = int(S_prime)
            prob2 = np.random.choice([1, 2])
            if prob2 == 1:
                Q1[S, A] = Q1[S, A] + alpha * (
                    R + GAMMA * Q2[S_prime, (Q1[S_prime]).argmax()] - Q1[S, A])
            else:
                Q2[S, A] = Q2[S, A] + alpha * (
                    R + GAMMA * Q1[S_prime, (Q2[S_prime]).argmax()] - Q2[S, A])
            S = S_prime
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
        if episodeNum % 100000 == 0 and episodeNum != 0:
            print("Average return so far: ", returnSum / episodeNum)
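# --- Hedged sketch (not part of the original submissions): the double
# Q-learning update used by learn() above, factored into a standalone helper.
# It assumes Q1 and Q2 are numpy arrays indexed [state, action], that the
# terminal state maps to a row of zeros, and that alpha and gamma are supplied
# by the caller. All names here are illustrative only.
import numpy as np

def double_q_update(Q1, Q2, S, A, R, S_prime, alpha, gamma):
    # With probability 0.5 update Q1 toward R + gamma * Q2[S', argmax_a Q1[S', a]],
    # otherwise update Q2 toward R + gamma * Q1[S', argmax_a Q2[S', a]].
    if np.random.randint(2) == 0:
        target = R + gamma * Q2[S_prime, np.argmax(Q1[S_prime])]
        Q1[S, A] += alpha * (target - Q1[S, A])
    else:
        target = R + gamma * Q1[S_prime, np.argmax(Q2[S_prime])]
        Q2[S, A] += alpha * (target - Q2[S, A])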
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        R, S = blackjack.sample(0, 1)
        if S == False:
            G = G + R
        else:
            while S != False:  # if state != terminal state
                R, S = blackjack.sample(S, randint(0, 2))
                G = G + R
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = blackjack.init()
        R, S = blackjack.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            A = Q.argmax()
            R, S = blackjack.sample(S, A)
            G += R
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = blackjack.init()
        A = [1, 0]
        a = np.random.choice(A)
        R, S = blackjack.sample(S, a)
        G += R
        while (S):
            a = np.random.choice(A)
            R, S = blackjack.sample(S, a)
            G += R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def deter_policy(state):
    global returnSum
    s = state
    while s != -1:
        r, s_ = bj.sample(s, sarsa_policy(s))
        s = s_
        returnSum = returnSum + r
def evaluateEpisode(G, eps):
    currentState = blackjack.init()  # returns the initial state
    while (True):  # repeat for each step
        (reward, currentState) = blackjack.sample(currentState, epsGreedyPolicy(currentState, eps))
        G += reward
        if (not currentState):  # if currentState is false, the episode has ended
            return G
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        s = blackjack.init()
        G = 0
        discount = 1
        while (s is not False):
            # epsilon-greedy action from argmax of the sum Q1 + Q2
            if np.random.randint(0, 101) >= eps * 100:
                if ((Q1[0, s] + Q2[0, s]) >= (Q1[1, s] + Q2[1, s])):
                    a = 0
                else:
                    a = 1
            else:
                a = np.random.randint(0, 2)
            r, s1 = blackjack.sample(s, a)
            if np.random.randint(0, 2) == 1:
                # Q1 update: evaluate Q1's greedy action at s1 with Q2
                c = discount * Q2[np.argmax(Q1, 0)[s1], s1]
                Q1[a, s] = Q1[a, s] + alpha * (r + c - Q1[a, s])
            else:
                # Q2 update: evaluate Q2's greedy action at s1 with Q1
                # Q2 <- Q2 + alpha * (R + discount * Q1(S', argmax_a Q2(S', a)) - Q2)
                c = discount * Q1[np.argmax(Q2, 0)[s1], s1]
                Q2[a, s] = Q2[a, s] + alpha * (r + c - Q2[a, s])
            G += r
            s = s1
        returnSum = returnSum + G
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        currentState = blackjack.init()
        terminate = False
        while not terminate:
            if randint(0, 101) > eps * 100:
                # greedy action
                action = argmax(theta1[currentState] + theta2[currentState])
            else:
                # epsilon action (explore)
                action = randint(0, 2)
            reward, nextState = blackjack.sample(currentState, action)
            G += reward
            if randint(0, 2) == 0:
                # 0.5 probability
                theta1[currentState, action] = theta1[currentState, action] + alpha * (
                    reward + theta2[nextState, argmax(theta1[nextState])] -
                    theta1[currentState, action])
            else:
                # 0.5 probability
                theta2[currentState, action] = theta2[currentState, action] + alpha * (
                    reward + theta1[nextState, argmax(theta2[nextState])] -
                    theta2[currentState, action])
            currentState = nextState
            if not nextState:
                terminate = True
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
def learn(alpha, eps, numTrainingEpisodes):
    # Fill in Q1 and Q2
    gamma = 1
    for i in range(1, 181):
        Q1[i][0] = random() * 0.00001
        Q1[i][1] = random() * 0.00001
        Q2[i][0] = random() * 0.00001
        Q2[i][1] = random() * 0.00001
    returnSum = 0
    for episodeNum in range(numTrainingEpisodes):
        state = blackjack.init()
        G = 0
        R, S = blackjack.sample(state, 1)
        if S == False:
            G = G + R
        while S != False:
            Q = Q1 + Q2
            if eps > random():
                A = randint(0, 2)  # eps is bigger, so take a random action
            else:
                # otherwise choose the action with the larger summed value
                if Q[S, 0] > Q[S, 1]:
                    A = 0
                else:
                    A = 1
            RR, nextstate = blackjack.sample(S, A)
            G = G + RR
            pro = randint(0, 2)
            if pro == 1:
                error = RR + gamma * Q2[nextstate][argmax(Q1[nextstate])] - Q1[S][A]
                Q1[S][A] = Q1[S][A] + alpha * (error)
            else:
                error = RR + gamma * Q1[nextstate][argmax(Q2[nextstate])] - Q2[S][A]
                Q2[S][A] = Q2[S][A] + alpha * (error)
            S = nextstate
        # Fill in Q1 and Q2
        returnSum = returnSum + G
def rand_policy(state):
    global returnSum
    s = state
    while s != -1:
        a = np.random.randint(0, 2)
        r, s_ = bj.sample(s, a)
        Q[s, a] = Q[s, a] + alpha * (r + 0.5 * Q[s_, 0] + 0.5 * Q[s_, 1] - Q[s, a])
        s = s_
        returnSum = returnSum + r
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        S = blackjack.init()
        G = 0
        A = 0
        R, S = blackjack.sample(S, A)
        G += R  # ACCOUNTS FOR THE NATURAL (INSTANT WIN OR DRAW)
        # iterate for each step of the episode
        while S:
            if np.random.random() > eps:
                if Q1[S][0] + Q2[S][0] >= Q1[S][1] + Q2[S][1]:
                    A = 0
                    R, nS = blackjack.sample(S, A)
                elif Q1[S][0] + Q2[S][0] < Q1[S][1] + Q2[S][1]:
                    A = 1
                    R, nS = blackjack.sample(S, A)
            else:
                A = np.random.randint(0, 2)
                R, nS = blackjack.sample(S, A)
            # 0.5 probability of doing Q1 or Q2
            prob = np.random.randint(0, 2)
            if not nS:
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (R - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (R - Q2[S][A])
            else:
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (
                        R + Q2[nS][np.argmax(Q1, 1)[nS]] - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (
                        R + Q1[nS][np.argmax(Q2, 1)[nS]] - Q2[S][A])
            S = nS
            G += R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
        if episodeNum % 10000 == 0 and episodeNum != 0:
            blackjack.printPolicy(policy)
            print("Average return so far: ", returnSum / episodeNum)
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0  # return
        R = 0  # reward
        S = blackjack.init()
        A = numpy.random.randint(0, 2)
        R, S = blackjack.sample(S, A)
        G += R
        # loops until terminal state
        while S != False:
            A = numpy.random.randint(0, 2)
            R, S = blackjack.sample(S, A)
            # adds to the return
            G += R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = blackjack.init()
        A = 0
        R, S = blackjack.sample(S, A)
        G += R
        while S:
            if Q1[S][0] + Q2[S][0] >= Q1[S][1] + Q2[S][1]:
                A = 0
                R, S = blackjack.sample(S, A)
            else:
                A = 1
                R, S = blackjack.sample(S, A)
            G += R
        # Use deterministic policy from Q1 and Q2 to run a number of
        # episodes without updates. Return average return of episodes.
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    Q = Q1 + Q2
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        R, S = blackjack.sample(0, 1)
        if S == False:
            G = G + R
        while S != False:  # if state != terminal state
            if Q[S, 0] >= Q[S, 1]:
                A = 0
            else:
                A = 1
            R, S = blackjack.sample(S, A)
            G = G + R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def runLearnedPolicy():
    G = 0
    # Init the game of blackjack and get the initial state
    s = blackjack.init()
    # Consider each step of the episode
    while s != -1:  # -1 is terminal
        # Take the action given by the learned policy
        a = getLearnedPolicy(s)
        r, sp = blackjack.sample(s, a)
        G += r
        s = sp
    return G
def assessPolicy(policy, numEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        S = blackjack.init()
        while S is not False:
            A = policy(S)
            R, Sprime = blackjack.sample(S, A)
            G += R
            S = Sprime
        returnSum += G
    return returnSum / numEpisodes
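# --- Hedged usage sketch (not part of the original submissions): assessPolicy()
# above accepts any callable mapping a state to an action, so an equiprobable
# random baseline can be scored with it directly. Assumes numpy is available
# and that the environment's two actions are 0 and 1, as in the code above.
import numpy as np

def randomPolicy(state):
    # Ignore the state and pick action 0 or 1 with equal probability.
    return np.random.randint(0, 2)

# Example call (illustrative episode count):
# averageReturn = assessPolicy(randomPolicy, 10000)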
def showOneGame():
    s = blackjack.init()
    moves = [0, 1, 0]
    turn = 0
    while s != -1:  # -1 is terminal
        a = moves[turn]
        r, sp = blackjack.sample(s, a)
        print("turn %d: s %d a %d -> r %d sp %d " % (turn, s, a, r, sp), end="")
        print("\t Player Sum: %d Dealer Card: %d Usable Ace: %d" %
              (blackjack.playerSum, blackjack.dealerCard, blackjack.usableAce))
        s = sp
        turn += 1
    return None
def episode(G, discount):
    currentState = blackjack.init()  # returns the initial state
    counter = 0
    while (True):
        (reward, currentState) = blackjack.sample(currentState, chooseActionFromState(currentState))
        G += (discount**counter) * reward
        counter += 1  # increment after using it to calculate the return (G)
        if (not currentState):
            # if currentState is false, the episode has ended
            return G
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        s = blackjack.init()
        while (s is not False):
            a = np.random.randint(0, 2)
            r, s = blackjack.sample(s, a)
            G += r
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def showOneGame():
    G = 0
    s = blackjack.init()
    turn = 0
    while s != -1:  # -1 is terminal
        a = randint(0, 1)
        r, sp = blackjack.sample(s, a)
        print("turn %d: s %d a %d -> r %d sp %d " % (turn, s, a, r, sp), end="")
        print("\t Player Sum: %d Dealer Card: %d Usable Ace: %d" %
              (blackjack.playerSum, blackjack.dealerCard, blackjack.usableAce))
        turn += 1
        s = sp
        G = G + r
    return G
def showOneGame():
    s = blackjack.init()
    moves = [0, 1]
    turn = 0
    Reward_sum = 0
    while s != -1:  # -1 is terminal
        a = moves[turn]
        r, sp = blackjack.sample(s, a)
        #print("turn %d: s %d a %d -> r %d sp %d " % (turn, s, a, r, sp), end="")
        #print("\t Player Sum: %d Dealer Card: %d Usable Ace: %d" % (blackjack.playerSum, blackjack.dealerCard, blackjack.usableAce))
        s = sp
        turn = random.randint(0, 1)
        Reward_sum += r
    return Reward_sum
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        currentState = blackjack.init()
        terminate = False
        while not terminate:
            action = policy(currentState)
            reward, nextState = blackjack.sample(currentState, action)
            G += reward
            currentState = nextState
            if not nextState:
                terminate = True
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):  # loop over the evaluation episodes
        G = 0
        state = blackjack.init()
        while (True):  # repeat until the terminal state is reached
            reward, state = blackjack.sample(state, np.random.randint(0, 2))
            G = G + reward
            if state == False:
                break
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        s = blackjack.init()
        while (s is not False):
            if ((Q1[0, s] + Q2[0, s]) >= (Q1[1, s] + Q2[1, s])):
                a = 0
            else:
                a = 1
            #a = np.random.randint(0, 2)  # Q1+Q2 (s,1) vs Q1+Q2 (s,0)
            r, s = blackjack.sample(s, a)
            G += r
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def Qlearning(ex):
    # Initialize the state (deal the first card)
    s = blackjack.init()
    segma_r = 0
    while s != -1:  # -1 is terminal
        a = argmax([ex[s, 0], ex[s, 1]])  # choose argmax(Q(s,a))
        if random.uniform(0, 1) < epsilon / 2:  # e-greedy
            a = abs(a - 1)
        # Q(s,a) <- Q(s,a) + alpha * (r + max_a Q(sp,a) - Q(s,a))
        r, sp = blackjack.sample(s, a)  # get the reward and s'
        ex[s, a] += alpha * (r - ex[s, a] + ex[sp, argmax([ex[sp, 0], ex[sp, 1]])])
        s = sp
        segma_r += r
    # Return the accumulated reward
    return segma_r
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        state = blackjack.init()
        while (True):
            # choose the action greedy wrt the sum of two action values
            reward, state = blackjack.sample(state, argmax(Q1[state] + Q2[state]))
            G = G + reward
            if state == False:
                break
        returnSum = returnSum + G
    #print("Determinstic return after learning: ", returnSum / numEvaluationEpisodes)
    return returnSum / numEvaluationEpisodes
def qLearning(self):
    for i in range(1, 181):
        randomValue1 = random.random()
        randomValue2 = random.random()
        randomValue1 = randomValue1 * 0.00001
        randomValue2 = randomValue2 * 0.00001
        self.q[i][0] = randomValue1
        self.q[i][1] = randomValue2
    iterations = 0
    returnSum = 0
    while iterations < self.MAXITERATIONS:
        s = blackjack.init()
        reward, state = blackjack.sample(s, 1)
        if state == -1:
            returnSum = returnSum + reward
        while state != -1:
            A = self.eGreedy(self.q, state)
            reward, statePrime = self.giveSample(state, A)
            returnSum = returnSum + reward
            if reward == 0 and statePrime != -1:
                theMaxAction = self.maxAction(self.q, statePrime)
                newStateMaxQSA = self.q[statePrime][theMaxAction]
            else:
                newStateMaxQSA = 0
            if self.ALPHA == "Dynamic":
                #print("YES")
                ALPHA = self.getDynamicAlpha(state, A)
            else:
                ALPHA = self.ALPHA
            bracketValue = reward + (self.GAMMA * newStateMaxQSA) - self.q[state][A]
            self.q[state][A] = self.q[state][A] + ALPHA * (bracketValue)
            state = statePrime
        iterations = iterations + 1
        if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
            print("Average Return During Learning Phase at " + str(iterations) + " is " + str(returnSum / iterations))
    print("The Policy learned From the Exploration Phase is : ")
    blackjack.printPolicy(self.printPolicy2)
    return returnSum / self.MAXITERATIONS
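# --- Hedged sketch (not part of the original submissions): the Q-learning
# target assembled inside qLearning() above, factored into a standalone
# helper. It assumes a tabular value function q indexed as q[state][action]
# with two actions, that the terminal next state contributes nothing, and
# that gamma plays the role of self.GAMMA above. Names are illustrative.

def q_learning_target(q, reward, statePrime, gamma, terminal=-1):
    # The bootstrapped part is zero at the terminal state, otherwise the
    # best action value available from the next state.
    if statePrime == terminal:
        nextBest = 0
    else:
        nextBest = max(q[statePrime][0], q[statePrime][1])
    return reward + gamma * nextBest

# Example update step (illustrative names):
# q[state][A] += ALPHA * (q_learning_target(q, reward, statePrime, GAMMA) - q[state][A])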
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    gamma = 1.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        state = blackjack.init()
        while (True):
            # choose an action from an epsilon-greedy policy
            num = np.random.random()
            if (num >= eps):
                action = argmax(Q1[state] + Q2[state])
            else:
                action = np.random.randint(0, 2)
            # perform the action
            if state == 0:
                reward, nextState = blackjack.firstSample()
            else:
                reward, nextState = blackjack.sample(state, action)
            # to deal with the terminal state
            if nextState == False:
                nextState = 0
            if np.random.randint(0, 2):
                # with 0.5 probability update Q1
                Q1[state][action] = Q1[state][action] + alpha * (
                    reward + gamma * Q2[nextState][argmax(Q1[nextState])] -
                    Q1[state][action])
            else:
                # with 0.5 probability update Q2
                Q2[state][action] = Q2[state][action] + alpha * (
                    reward + gamma * Q1[nextState][argmax(Q2[nextState])] -
                    Q2[state][action])
            # update state
            state = nextState
            # update the return for state 0 with discount ratio gamma = 1
            G = G + reward
            if state == False:
                break
        returnSum = returnSum + G
        if episodeNum % 10000 == 0 and episodeNum != 0:
            #print("Average return so far: ", returnSum / episodeNum)
            pass
def learnEpisode(alpha, eps, gamma):
    currentState = blackjack.init()  # returns the initial state
    episodeReturn = 0
    while (True):  # repeat for each step of the episode
        action = epsGreedyPolicy(currentState, eps)
        (reward, nextState) = blackjack.sample(currentState, action)
        episodeReturn += reward
        if (nextState):
            if (np.random.randint(0, 2)):  # returns 0 or 1
                Q1[currentState, action] = Q1[currentState, action] + alpha * (
                    reward + gamma * Q2[nextState, np.argmax(Q1[nextState])] -
                    Q1[currentState, action])
            else:
                Q2[currentState, action] = Q2[currentState, action] + alpha * (
                    reward + gamma * Q1[nextState, np.argmax(Q2[nextState])] -
                    Q2[currentState, action])
            currentState = nextState
        else:
            # terminal state: the remaining rewards are 0, so the target is just the reward
            if (np.random.randint(0, 2)):  # returns 0 or 1
                Q1[currentState, action] = Q1[currentState, action] + alpha * (
                    reward - Q1[currentState, action])
            else:
                Q2[currentState, action] = Q2[currentState, action] + alpha * (
                    reward - Q2[currentState, action])
            return episodeReturn  # nextState is false, so the episode is over
def bjrandomPolicy(numEpisodes=10000):
    # Input: number of episodes
    # Output: average return over the episodes
    # Policy: equally random
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        # Equiprobable random action, assuming gamma = 1
        s = blackjack.init()
        while s != -1:  # -1 is terminal
            # randint returns a number between 0 and 1
            a = random.randint(0, 1)
            r, sp = blackjack.sample(s, a)
            G += r
            s = sp
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    print("Average return: ", returnSum / numEpisodes)
    return None
def exp_sarsa(state):
    global returnSum
    s = state
    while s != -1:
        if Q[s, 0] > Q[s, 1]:
            a = 0
        else:
            a = 1
        r, s_ = bj.sample(s, a)
        # a_ is the action chosen by the target epsilon-greedy policy
        rand = np.random.random()
        if rand < epi:
            # random
            a_ = np.random.randint(0, 2)
        else:
            # greedy
            a_ = np.argmax(Q[s_])
        Q[s, a] = Q[s, a] + alpha * (r + Q[s_, a_] - Q[s, a])
        s = s_
        returnSum = returnSum + r
def onlyExploitQ(self):
    iterations = 0
    returnSum = 0
    while iterations < self.MAXITERATIONS:
        s = blackjack.init()
        reward, state = blackjack.sample(s, 1)
        if state == -1:
            returnSum = returnSum + reward
        while state != -1:
            A = self.maxAction(self.q, state)
            reward, statePrime = self.giveSample(state, A)
            returnSum = returnSum + reward
            state = statePrime
        iterations = iterations + 1
        if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
            print("Average Return During Exploitation Phase at " + str(iterations) + " is " + str(returnSum / iterations))
    return returnSum / self.MAXITERATIONS
import blackjack
from pylab import *

numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    currentstate = blackjack.init()
    while (currentstate != -1):
        action = randint(2)  # randomly pick the action
        next = blackjack.sample(currentstate, action)
        G = G + next[0]
        currentstate = next[1]
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
import blackjack
from pylab import *

numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    # my code starts here
    R, S = blackjack.sample(blackjack.init(), randint(0, 2))
    G += R
    while (S != (-1)):
        R, S = blackjack.sample(S, randint(0, 2))
        G += R
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
alpha = 0.001
returnSum = 0.0
# one independent [0, 0] pair per state ([[0,0]]*183 would alias a single row)
Q = [[0, 0] for _ in range(183)]
#for i in Q:
#    i[0], i[1] = uniform(0, 1), uniform(0, 1)
for episodeNum in range(numEpisodes):
    G = 0
    # my code starts here
    a = 0
    S = blackjack.init()
    t = Q[S]
    if (e > randint(0, 2)):
        a = randint(0, 2)
        R, S_ = blackjack.sample(S, a)
    else:
        if t[0] > t[1]:
            R, S_ = blackjack.sample(S, 0)
            a = 0
        else:
            R, S_ = blackjack.sample(S, 1)
            a = 1
    while (S_ != (-1)):
        Q[S][a] = Q[S][a] + alpha * (R + Q[S_][0] + Q[S_][1] - Q[S][a])
        S = S_
        t = Q[S]
        if (e > randint(0, 2)):
            a = randint(0, 2)
            R, S_ = blackjack.sample(S, a)
        else:
            if t[0] > t[1]:
                R, S_ = blackjack.sample(S, 0)
                a = 0
            else:
                R, S_ = blackjack.sample(S, 1)
                a = 1
def policyPrint(state):
    return argmax(Q[state])

for episodeNum in range(numEpisodes):
    #blackjack.init()
    G = 0
    state = 0
    while state != -1:
        # take an action according to the behaviour policy
        if rand() <= epsilonMu:
            action = randint(2)
        else:
            action = argmax(Q[state])
        # do that action
        result = blackjack.sample(state, action)
        reward = result[0]
        newState = result[1]
        # Expected Sarsa update
        Q[state, action] = Q[state, action] + alpha * (reward + policySum(newState, epsilonPi) - Q[state, action])
        # update values
        G += reward
        state = newState
    if episodeNum % 10000 == 0 and episodeNum != 0:
        print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum / (episodeNum)
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
import blackjack
from pylab import *
import numpy as np
import random

numEpisodes = 2000
returnSum = 0.0

""" Returns equiprobable random policy """
def policy():
    return random.randint(0, 1)

""" Experiment """
for episodeNum in range(numEpisodes):
    G = 0
    state = 0
    blackjack.init()
    while (state != -1):
        returntuple = blackjack.sample(state, policy())
        reward = returntuple[0]
        state = returntuple[1]
        G += reward
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
numEpisodesLearn = 1000000
numEpisodesEval = 10000000
alpha = 1e-3
eps_mu = 1e-2
eps_pi = 1e-2

Q = 1e-4 * np.random.random((2 + numStates, numActions))
Q[-1] = np.zeros((numActions))

returnSum = 0.0
for episodeNum in xrange(numEpisodesLearn):
    G = 0.0
    s = blackjack.init()
    while (s != -1):
        a = np.argmax(Q[s]) if np.random.random() > eps_mu \
            else np.random.randint(numActions)
        (r, sp) = blackjack.sample(s, a)
        v_pi = eps_pi * np.average(Q[sp]) + (1 - eps_pi) * np.max(Q[sp])
        Q[s, a] += alpha * (r + gamma * v_pi - Q[s, a])
        G = r + gamma * G
        s = sp
    returnSum += G
    ep = episodeNum + 1
    if (ep % 10000 == 0):
        print "Episode: ", ep, "Average return: ", returnSum / ep
print "Average return while learning: ", returnSum / numEpisodesLearn

greedy = lambda s: np.argmax(Q[s])
blackjack.printPolicy(greedy)

returnSum = 0.0
for episodeNum in xrange(numEpisodesEval):
    # while S is not in terminal state
    while S != -1:
        # Choose action here based on epsilon
        decider = random.random()
        if decider <= epsilon:
            A = randint(0, 1)
        else:
            # A = the best action to take
            if Q[S][0] >= Q[S][1]:
                A = 0
            else:
                A = 1
        R, Sprime = blackjack.sample(S, A)
        G = G + R
        if episodeNum > dropAlpha:
            print R
        if Sprime == -1:
            Q[S][A] = Q[S][A] + alpha * (R - Q[S][A])
        else:
            Q[S][A] = Q[S][A] + alpha * (R + gamma * (max(Q[Sprime][0], Q[Sprime][1])) - Q[S][A])
        S = Sprime
    if episodeNum == dropEpsilonEpisode:
        #print "=============================END EXPLORING PHASE============================="
def giveSample(self, state, action):
    return blackjack.sample(state, action)
def gameStart(self):
    return blackjack.sample(0, 1)
Q = zeros((M, N))
returnSum = 0.0
epsilon = 0.1
alpha = 0.001

for episodeNum in range(numEpisodes):
    random.seed(episodeNum)
    # Cumulative reward
    G = 0
    # Init the game of blackjack and get the initial state
    s = blackjack.init()
    # Consider each step of the episode
    while s != -1:  # -1 is terminal
        # Take epsilon greedy action at each step of episode
        a = getEpsilonGreedyAction(Q, s, epsilon)
        r, sp = blackjack.sample(s, a)
        # Update action value function with Q-learning off-policy update
        Q[s, a] = Q[s, a] + alpha * (r + max(Q[sp, :]) - Q[s, a])
        G += r
        s = sp
    if not (episodeNum % 10000):
        print("Episode: ", episodeNum, "Return: ", G)
    returnSum = returnSum + G
print("Average return: ", returnSum / numEpisodes)
blackjack.printPolicy(getLearnedPolicy)

# Run learned policy
policySum = 0.0
for policyEpisodeNum in range(numEpisodes):
def random_policy(list_of_actions):
    # returns a random action from a list of possible actions
    next_action = choice(list_of_actions)
    #print next_action
    return next_action

numEpisodes = 10000
returnSum = 0.0
actions = [0, 1]
for episodeNum in range(numEpisodes):
    s = blackjack.init()
    G = 0
    while (s != -1):
        a = random_policy(actions)
        result = blackjack.sample(s, a)
        #print blackjack.sample(0, 1)
        G = G + result[0]
        s = result[1]
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
printPolicy(Q)
G = 0
# Start a new game of blackjack
currentState = blackjack.init()
# Continue this game until the terminal state is reached
while (currentState != -1):
    # Get a random number between 0 and 1; if it's less than epsilon behavior, then explore
    rnumber = n.random.rand()
    if rnumber < epsilon:
        action = n.random.randint(2)
    else:
        # If not exploring, pick the highest action at state S
        action = returnPolicy(currentState)
    # Get the next state: reward and next state
    next = blackjack.sample(currentState, action)
    reward = next[0]
    nextstate = next[1]
    # Add to return
    G = G + reward
    # Get chance of being greedy
    greedychance = 1 - epsilon
    # Get best value at the next state
    highest = argmax(states[nextstate])
    # Expected sarsa calculation: (greedy * best_next_state_action) + (explore * (0.5*next_state_action1 + 0.5*next_state_action2))
    target = (greedychance * states[nextstate][highest]) + (epsilon * (0.5 * states[nextstate][0] + 0.5 * states[nextstate][1]))
    states[currentState][action] = states[currentState][action] + alpha * (reward + target - states[currentState][action])
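# --- Hedged sketch (not part of the original submissions): the Expected Sarsa
# target computed inline above, written as a helper. It assumes a table
# `states` indexed [state][action] with two actions and an epsilon-greedy
# target policy that gives each action epsilon/2 exploration probability,
# matching the inline calculation above. Names are illustrative only.
import numpy as np

def expected_sarsa_target(states, nextstate, epsilon):
    # Expectation over the epsilon-greedy target policy:
    # (1 - epsilon) weight on the greedy action, epsilon split evenly over both.
    greedy_action = np.argmax(states[nextstate])
    return ((1 - epsilon) * states[nextstate][greedy_action]
            + epsilon * 0.5 * (states[nextstate][0] + states[nextstate][1]))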
    if np.argmax(Q[s]) == a:
        return 1 - e + e / num_actions
    else:
        return e / num_actions

# Learning the policy through the Expected Sarsa algorithm
Q = 0.00001 * np.random.rand(num_states, num_actions)
for episodeNum in range(numEpisodes):
    G = 0
    s = bj.init()
    while s != -1:
        a = np.random.choice(2, p=[actionProb(emu, 0, s), actionProb(emu, 1, s)])
        r, s1 = bj.sample(s, a)
        Q[s, a] = Q[s, a] + alpha * (r + actionProb(epi, 0, s1) * Q[s1, 0] + actionProb(epi, 1, s1) * Q[s1, 1] - Q[s, a])
        s = s1
        G += r
    returnSum = returnSum + G
    if episodeNum % 10000 == 0:
        print "Episode: ", episodeNum
        print "Average return: ", returnSum / (episodeNum + 1)

# Function for the learned policy
def learnedPolicy(s):
    return np.argmax(Q[s])

# Printing out the learned policy
#---------------------------------------------------------------
# Course: CMPUT 366
# Assignment: Project1
# Due Date: Nov 5, 2015
# Names: Mujda Abbasi - Zainab Alsharif
# Student ID: 1298314 1223455
#---------------------------------------------------------------
import blackjack as bj
import numpy as np
from pylab import *

numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    s = bj.init()
    while s != -1:
        r, s = bj.sample(s, np.random.randint(2))
        G += r
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
import blackjack
from pylab import *
from random import randint

numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    S = blackjack.init()
    while S != -1:
        R, Sprime = blackjack.sample(S, randint(0, 1))
        G = G + R
        S = Sprime
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
import blackjack
import numpy as np
from pylab import *

numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    s = blackjack.init()
    while s != -1:
        a = np.random.randint(0, 2)
        r, s_ = blackjack.sample(s, a)
        G += r
        s = s_
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
def giveSample(state):
    return blackjack.sample(state, giveAction())
import blackjack
from pylab import *
from random import *

numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    black = blackjack.init()
    action = [0, 1]
    while black != -1:
        num = randint(0, 1)
        n, black = blackjack.sample(black, action[num])
        G += n
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
    return Q[state][np.argmax(Q[state])]

"""
Experiments:
First learn policy and calculate average return
"""
for episodeNum in range(numEpisodes):
    blackjack.init()
    state = 0
    return1 = 0
    while (state != -1):
        action = policy(state)
        reward, statep = blackjack.sample(state, action)
        Q[state][action] = Q[state][action] + alpha * (reward + expectedValue(statep) - Q[state][action])
        state = statep
        return1 += reward
    returnSum += return1
    if (((episodeNum % 10000) == 0) and (episodeNum != 0)):
        print "Count =", episodeNum, "Average return: ", returnSum / (episodeNum)
blackjack.printPolicy(learnedPolicy)
print "Average return: ", float(returnSum) / float(numEpisodes)
returnSumLearned = 0
"""