Example #1
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        gamma = 1
        # Choose a random action - 1 or 0
        action = random.randint(0, 1)
        # initial state at the start of a new blackjack game
        state = blackjack.init()
        # Next state and reward from current state and action
        stateAction = blackjack.sample(state, action)
        reward = stateAction[0]
        state = stateAction[1]
        G = G + reward
        #print("reward1, newstate", reward, state)

        # go through every step of the blackjack game
        while state != False:
            # Choose a random action - 1 or 0
            action = random.randint(0, 1)
            # Next state and reward from current state and action
            stateAction = blackjack.sample(state, action)
            reward = stateAction[0]
            state = stateAction[1]
            #print("reward, newstate", reward, state)
            G = G + reward

        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes


#run(numEvaluationEpisodes)
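
All of these snippets drive the same course-provided blackjack module through blackjack.init() and blackjack.sample(state, action), which returns a (reward, nextState) pair. That module is not part of this listing, so the stand-in below is only a sketch with the same call signatures: the action meaning (1 = hit), the state numbering, and the terminal marker are assumptions, and the snippets themselves disagree on whether the terminal state is -1 or False (this sketch uses -1). It exists only so the examples can be exercised locally.

# blackjack_stub.py - hypothetical stand-in for the course's blackjack module,
# mimicking only the interface the snippets use: init(), sample(state, action),
# printPolicy(policy), and the playerSum / dealerCard / usableAce fields.
import random

playerSum = 0    # current player total
dealerCard = 0   # dealer's face-up card
usableAce = 0    # 1 if the player holds an ace counted as 11

def _deal():
    # One card: 2-10 at face value, J/Q/K as 10, ace as 1.
    return min(random.randint(1, 13), 10)

def _encode():
    # Pack (playerSum, dealerCard, usableAce) into a positive state id.
    return (playerSum - 1) + 21 * (dealerCard - 1) + 210 * usableAce + 1

def init():
    # Deal the opening hand and return the initial state id.
    global playerSum, dealerCard, usableAce
    c1, c2 = _deal(), _deal()
    dealerCard = _deal()
    usableAce = 1 if 1 in (c1, c2) else 0
    playerSum = c1 + c2 + (10 if usableAce else 0)
    return _encode()

def sample(state, action):
    # Take action (assumed: 1 = hit, 0 = stand); return (reward, nextState),
    # where nextState == -1 once the hand is over.
    global playerSum, usableAce
    if action == 1:                        # hit
        playerSum += _deal()
        if playerSum > 21 and usableAce:   # count the ace as 1 instead of 11
            playerSum -= 10
            usableAce = 0
        if playerSum > 21:                 # bust
            return (-1, -1)
        return (0, _encode())
    dealer = dealerCard + _deal()          # stand: dealer hits to 17, then settle
    while dealer < 17:
        dealer += _deal()
    if dealer > 21 or playerSum > dealer:
        return (1, -1)
    return (0, -1) if playerSum == dealer else (-1, -1)

def printPolicy(policy):
    # Call policy(state) for every state id and print the chosen actions.
    for s in range(1, 421):
        print(policy(s), end=" " if s % 21 else "\n")
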
Example #2
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        s = blackjack.init()  # initial state
        # Choose an action with greedy policy
        a = policy(s)
        # Next state and reward from current state and action
        stateAction = blackjack.sample(s, a)
        newState = stateAction[1]
        reward = stateAction[0]
        G = G + reward

        # go through every step of the blackjack game
        while newState != False:

            a = policy(newState)
            # Next state and reward from current state and action
            stateAction = blackjack.sample(newState, a)
            reward = stateAction[0]
            newState = stateAction[1]
            G = G + reward

        # Use deterministic policy from Q1 and Q2 to run a number of
        # episodes without updates. Return average return of episodes.
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
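
This evaluate() calls a policy(s) helper that is not shown in the snippet. Judging from the comment about acting deterministically from Q1 and Q2 (and from Example #27 below, which does the same selection inline), a minimal sketch of such a helper is the greedy choice over the summed tables; Q1 and Q2 are assumed to be numStates-by-2 arrays defined elsewhere in the same file.

import numpy as np

def policy(s):
    # Hypothetical helper: act greedily on the summed double-Q values for state s.
    return int(np.argmax(Q1[s] + Q2[s]))
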
Example #3
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        S = blackjack.init()
        R, S = blackjack.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S,:]+Q2[S,:]
            prob1 = np.random.random()
            if prob1 < eps:
                # explore
                A = np.random.choice([0, 1])
            else:
                # greedy
                A = Q.argmax()

            R, S_prime = blackjack.sample(S, A)
            G += R
            S_prime = int(S_prime)

            prob2 = np.random.choice([1, 2])
            if prob2 == 1:
                Q1[S, A] = Q1[S, A] + alpha * (
                R + GAMMA * Q2[S_prime, (Q1[S_prime]).argmax()] - Q1[S, A])
            else:
                Q2[S, A] = Q2[S, A] + alpha * (R + GAMMA * Q1[S_prime, (Q2[S_prime]).argmax()] - Q2[S, A])

            S = S_prime
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
        if episodeNum % 100000 == 0 and episodeNum != 0:
            print("Average return so far: ", returnSum/episodeNum)

Example #4
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        R, S = blackjack.sample(0, 1)
        if S == False:
            G = G + R

        else:
            while S != False:  #if state a!= terminal state
                R, S = blackjack.sample(S, randint(0, 2))
                G = G + R
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #5
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = blackjack.init()
        R, S = blackjack.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            A = Q.argmax()
            R, S = blackjack.sample(S, A)
            G += R

        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes

Example #6
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = blackjack.init()
        A = [1, 0]
        a = np.random.choice(A)
        R, S = blackjack.sample(S, a)
        G += R
        while (S):
            a = np.random.choice(A)
            R, S = blackjack.sample(S, a)
            G += R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #7
def deter_policy(state):
	global returnSum
	s=state
	while s!=-1:
		r,s_=bj.sample(s,sarsa_policy(s))
		s=s_
	returnSum=returnSum + r
Example #8
def evaluateEpisode(G, eps):
    currentState = blackjack.init() # returns the initial state
    while(True):  # repeat for each step
        (reward, currentState) = blackjack.sample(currentState, epsGreedyPolicy(currentState, eps))
        G += reward
        if(not currentState): # currentState is False at the end of the episode
            return G
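
evaluateEpisode() relies on an epsGreedyPolicy(state, eps) function defined elsewhere in the original file. A minimal sketch, assuming the same Q1/Q2 tables used by the neighbouring snippets (the original helper may act on a single Q table instead):

import numpy as np

def epsGreedyPolicy(state, eps):
    # Hypothetical helper: explore with probability eps, otherwise act
    # greedily on the summed action values Q1 + Q2.
    if np.random.random() < eps:
        return np.random.randint(0, 2)            # random action: 0 or 1
    return int(np.argmax(Q1[state] + Q2[state]))  # greedy action
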
Example #9
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        s = blackjack.init()
        G = 0
        discount = 1
        while (s is not False):
            if np.random.randint(0, 101) >= eps * 100:
                if ((Q1[0, s] + Q2[0, s]) >= (Q1[1, s] + Q2[1, s])):
                    a = 0
                else:
                    a = 1
            else:
                a = np.random.randint(0, 2)
            #a = 1 # Q1 + Q2 egreedy   argmax(sum Q1+Q2)
            r, s1 = blackjack.sample(s, a)
            if np.random.randint(0, 2) == 1:
                c = discount * Q2[np.argmax(Q2, 0)[s1], s1]
                Q1[a, s] = Q1[a, s] + alpha * (r + c - Q1[a, s])
            else:
                c = discount * Q1[np.argmax(Q1, 0)[s1], s1]
                Q2[a, s] = Q2[a, s] + alpha * (r + c - Q2[a, s])
                #Q2 <- Q2 + alpha(R + discount (S', argmaxQ1(S',a)) - Q2)
            #G+=r
            s = s1
Example #10
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        currentState = blackjack.init()
        terminate = False
        while not terminate:
            if randint(0, 101) > eps * 100:  #greedy action
                action = argmax(theta1[currentState] + theta2[currentState])
            else:  #Epsilon action (explore)
                action = randint(0, 2)
            G, nextState = blackjack.sample(currentState, action)
            if randint(0, 2) == 0:  #0.5 probability
                theta1[currentState,
                       action] = theta1[currentState, action] + alpha * (
                           G + theta2[nextState,
                                      argmax(theta1[nextState])] -
                           theta1[currentState, action])
            else:  #0.5 probability
                theta2[currentState,
                       action] = theta2[currentState, action] + alpha * (
                           G + theta1[nextState,
                                      argmax(theta2[nextState])] -
                           theta2[currentState, action])
            currentState = nextState
            if not nextState:
                terminate = True
            #print("Episode: ", episodeNum, "Return: ", G)

        returnSum = returnSum + G
Example #11
def learn(alpha, eps, numTrainingEpisodes):

    # Fill in Q1 and Q2
    gamma = 1

    for i in range(1, 181):
        Q1[i][0] = random() * 0.00001
        Q1[i][1] = random() * 0.00001

        Q2[i][0] = random() * 0.00001
        Q2[i][1] = random() * 0.00001

    returnSum = 0
    for episodeNum in range(numTrainingEpisodes):
        state = blackjack.init()
        G = 0
        R, S = blackjack.sample(state, 1)
        if S == False:
            G = G + R

        Q = Q1 + Q2
        while S != False:
            if eps > random():
                A = randint(0, 2)  # eps greater: take a random (exploratory) action
            else:  # otherwise take the action with the larger summed value

                if Q[S, 0] > Q[S, 1]:
                    A = 0
                else:
                    A = 1

            RR, nexstate = blackjack.sample(S, A)
            G = G + RR
            pro = randint(0, 2)
            if pro == 1:
                error = RR + gamma * Q2[nexstate][argmax(
                    Q1[nexstate])] - Q1[S][A]
                Q1[S][A] = Q1[S][A] + alpha * (error)

            else:
                error = RR + gamma * Q1[nexstate][argmax(
                    Q2[nexstate])] - Q2[S][A]
                Q2[S][A] = Q2[S][A] + alpha * (error)
            S = nexstate
    # Fill in Q1 and Q2

        returnSum = returnSum + G
Example #12
def rand_policy(state):
	global returnSum
	s=state
	while s!=-1:
		a=np.random.randint(0,2)
		r,s_=bj.sample(s,a)
		Q[s,a]=Q[s,a]+alpha*(r+0.5*Q[s_,0]+0.5*Q[s_,1]-Q[s,a])
		s=s_
	returnSum=returnSum + r
Example #13
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        S = blackjack.init()
        G = 0
        A = 0
        R, S = blackjack.sample(S, A)
        G += R  # ACCOUNTS FOR THE NATURAL (INSTANT WIN OR DRAW)

        # iterate for each step of the episode
        while S:
            if np.random.random() > eps:
                if Q1[S][0] + Q2[S][0] >= Q1[S][1] + Q2[S][1]:
                    A = 0
                    R, nS = blackjack.sample(S, A)
                elif Q1[S][0] + Q2[S][0] < Q1[S][1] + Q2[S][1]:
                    A = 1
                    R, nS = blackjack.sample(S, A)
            else:
                A = np.random.randint(0, 2)
                R, nS = blackjack.sample(S, A)

            # 0.5 probability of doing Q1 or Q2
            prob = np.random.randint(0, 2)
            if not nS:
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (R - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (R - Q2[S][A])
            else:
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (
                        R + Q2[nS][np.argmax(Q1, 1)[nS]] - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (
                        R + Q1[nS][np.argmax(Q2, 1)[nS]] - Q2[S][A])
            S = nS
            G += R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
        if episodeNum % 10000 == 0 and episodeNum != 0:
            blackjack.printPolicy(policy)
            print("Average return so far: ", returnSum / episodeNum)
Example #14
def run(numEvaluationEpisodes):
    returnSum = 0.0

    for episodeNum in range(numEvaluationEpisodes):
        G = 0  #return
        R = 0  #reward
        S = blackjack.init()
        A = numpy.random.randint(0, 2)
        R, S = blackjack.sample(S, A)
        G += R

        # loops until terminal state
        while S != False:
            A = numpy.random.randint(0, 2)
            R, S = blackjack.sample(S, A)
            # adds to the return
            G += R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #15
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = blackjack.init()
        A = 0
        R, S = blackjack.sample(S, A)
        G += R
        while S:
            if Q1[S][0] + Q2[S][0] >= Q1[S][1] + Q2[S][1]:
                A = 0
                R, S = blackjack.sample(S, A)
            else:
                A = 1
                R, S = blackjack.sample(S, A)
            G += R
        # Use deterministic policy from Q1 and Q2 to run a number of
        # episodes without updates. Return average return of episodes.
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #16
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    Q = Q1 + Q2
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        R, S = blackjack.sample(0, 1)
        if S == False:
            G = G + R

        while S != False:  #if state a!= terminal state
            if Q[S, 0] >= Q[S, 1]:
                A = 0
            else:
                A = 1
            R, S = blackjack.sample(S, A)
            G = G + R
            #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G

    return returnSum / numEvaluationEpisodes
Example #17
def runLearnedPolicy():
	G = 0
	# Init the game of blackjack and get the initial state
	s = blackjack.init()
	#Consider each step of the episode
	while s!=-1: #-1 is terminal
		# Take the action given by learned policy
		a = getLearnedPolicy(s)
		r,sp = blackjack.sample(s,a)
		G += r
		s=sp
	return G
Example #18
def assessPolicy(policy, numEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        S = blackjack.init()
        while S is not False:
            A = policy(S)
            R, Sprime = blackjack.sample(S, A)
            G += R
            S = Sprime
        returnSum += G
    return returnSum / numEpisodes
Example #19
def showOneGame():
	s=blackjack.init()
	moves=[0,1,0] 
	turn=0
	while s!=-1: #-1 is terminal
		a=moves[turn]
		r,sp=blackjack.sample(s,a)
		print("turn %d: s %d a %d -> r %d sp %d "%(turn,s,a,r,sp),end="")
		print("\t Player Sum: %d  Dealer Card: %d  Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
		s=sp
		turn+=1
	return None
Example #20
def episode(G, discount):
    currentState = blackjack.init()  # returns the initial state
    counter = 0
    while True:
        (reward, currentState) = blackjack.sample(currentState, chooseActionFromState(currentState))
        G += (discount**counter) * reward
        counter += 1  # Need to increment after using it to calculate the return (G)
        if not currentState:  # currentState is False at the end of the episode
            return G
Example #21
def run(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        s = blackjack.init()
        while (s is not False):
            a = np.random.randint(0, 2)
            r, s = blackjack.sample(s, a)
            G += r
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #22
def showOneGame():
    s=blackjack.init()
    moves=[0,1,0] 
    turn=0
    while s!=-1: #-1 is terminal
        a=moves[turn]
        r,sp=blackjack.sample(s,a)
        print("turn %d: s %d a %d -> r %d sp %d "%(turn,s,a,r,sp),end="")
        print("\t Player Sum: %d  Dealer Card: %d  Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        s=sp
        turn+=1
    return None
Example #23
def showOneGame():
    G = 0
    s=blackjack.init()
    turn=0
    while s!=-1: #-1 is terminal
        a=randint(0,1)
        r,sp=blackjack.sample(s,a)
        print("turn %d: s %d a %d -> r %d sp %d "%(turn,s,a,r,sp),end="")
        print("\t Player Sum: %d  Dealer Card: %d  Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        turn+=1
        s=sp
        G=G+r
    return G

Example #24
def showOneGame():
    s=blackjack.init()
    moves=[0,1] 
    turn=0
    Reward_sum = 0
    while s!=-1: #-1 is terminal
        a= moves[turn]
        r,sp=blackjack.sample(s,a)
        #print("turn %d: s %d a %d -> r %d sp %d "%(turn,s,a,r,sp),end="")
        #print("\t Player Sum: %d  Dealer Card: %d  Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        s=sp
        turn=random.randint(0,1)
        Reward_sum +=r
    return Reward_sum
Example #25
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        currentState = blackjack.init()
        terminate = False
        while not terminate:
            action = policy(currentState)
            G, nextState = blackjack.sample(currentState, action)
            currentState = nextState
            if not nextState:
                terminate = True
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #26
def run(numEvaluationEpisodes):

    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):  # loop over the evaluation episodes
        G = 0
        state = blackjack.init()
        while (True):  # step until the terminal state is reached
            reward, state = blackjack.sample(state, np.random.randint(0, 2))
            G = G + reward
            if state == False:
                break

        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #27
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        s = blackjack.init()
        while (s is not False):
            if ((Q1[0, s] + Q2[0, s]) >= (Q1[1, s] + Q2[1, s])):
                a = 0
            else:
                a = 1
            #a = np.random.randint(0,2)#Q1+Q2 (s,1) vs Q1+Q2(s,0)
            r, s = blackjack.sample(s, a)
            G += r
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #28
def Qlearning(ex):
  
  # Initialize the state (deal the first cards)
  s=blackjack.init()
  segma_r = 0
  while s!=-1:                                      # -1 is terminal
    a = argmax([ex[s,0], ex[s,1]])                 # Choose argmax(Q(s,a))   
    if random.uniform(0,1) < epsilon/2:            # e-greedy
      a = abs(a-1)    
      
    # Q(s,a) <- Q(s,a) + alpha * (r + max_a' Q(sp,a') - Q(s,a))
    
    r,sp=blackjack.sample(s,a)                      # Get the reward and s'
    ex[s,a] += alpha * (r - ex[s,a] + ex[sp,argmax([ex[sp,0],ex[sp,1]])])  
    s=sp; segma_r += r                              # accumulate reward and move to the next state
  return segma_r                                   
Example #29
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        state = blackjack.init()
        while (True):
            # choose the action greedy wrt the sum of two action values
            reward, state = blackjack.sample(state,
                                             argmax(Q1[state] + Q2[state]))
            G = G + reward

            if state == False:
                break
        returnSum = returnSum + G
    #print ("Determinstic return after learning: ",returnSum/numEvaluationEpisodes)

    return returnSum / numEvaluationEpisodes
Example #30
        def qLearning(self):
          for i in range(1,181):
            randomValue1 = random.random()
            randomValue2 = random.random()
            
            randomValue1 = randomValue1 * 0.00001
            randomValue2 = randomValue2 * 0.00001
            self.q[i][0] = randomValue1
            self.q[i][1] = randomValue2
          
          iterations = 0
          returnSum = 0
          while iterations < self.MAXITERATIONS:      
            s = blackjack.init()
            reward, state = blackjack.sample(s,1)
            if state == -1:
              returnSum = returnSum+reward
            while state != -1:
              A = self.eGreedy(self.q,state)
              reward, statePrime = self.giveSample(state, A)
              returnSum = returnSum + reward
              if reward == 0 and statePrime != -1:
                theMaxAction = self.maxAction(self.q, statePrime)
                newStateMaxQSA = self.q[statePrime][theMaxAction]
              else:
                newStateMaxQSA = 0
              
              if self.ALPHA == "Dynamic":
                      #print("YES")
                 ALPHA = self.getDynamicAlpha(state,A)
              else:
                 ALPHA = self.ALPHA

              bracketValue = reward+(self.GAMMA*newStateMaxQSA)-self.q[state][A]
              self.q[state][A] = self.q[state][A]+ALPHA*(bracketValue)  
              state = statePrime
            
            iterations = iterations + 1
            if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
                print("Average Return During Learning Phase at "+str(iterations)+" is "+str(returnSum/iterations))

          
          print("The Policy learned From the Exploration Phase is : ")
          blackjack.printPolicy(self.printPolicy2)
          return returnSum/self.MAXITERATIONS
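
qLearning() above can switch to a per-state-action step size through self.getDynamicAlpha(state, A), which is not included in the snippet. Purely as an assumption about what that helper might do, one common choice is the sample-average step size 1/N(s, a):

class DynamicAlphaMixin:
    # Hypothetical sketch of the missing helper: shrink the step size for each
    # (state, action) pair as 1 / N(s, a), assuming a zero-initialised
    # self.visits table with the same shape as self.q.
    def getDynamicAlpha(self, state, action):
        self.visits[state][action] += 1
        return 1.0 / self.visits[state][action]
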
Example #31
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    gamma = 1.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        state = blackjack.init()
        while (True):
            # choose an action epsilon-greedily
            num = np.random.random()
            if (num >= eps):
                action = argmax(Q1[state] + Q2[state])
            else:
                action = np.random.randint(0, 2)

            # perform action
            if state == 0:
                reward, nextState = blackjack.firstSample()
            else:
                reward, nextState = blackjack.sample(state, action)

            # to deal with the terminal state
            if nextState == False:
                nextState = 0

            if np.random.randint(0, 2):  # with 0.5 probability
                Q1[state][action] = Q1[state][action] + alpha * (
                    reward + gamma * Q2[nextState][argmax(Q1[nextState])] -
                    Q1[state][action])
            else:  # with 0.5 probability
                Q2[state][action] = Q2[state][action] + alpha * (
                    reward + gamma * Q1[nextState][argmax(Q2[nextState])] -
                    Q2[state][action])

            # update state
            state = nextState
            G = G + reward  # update the return for state 0 with discount ratio gamma=1

            if state == False:
                break
        returnSum = returnSum + G
        if episodeNum % 10000 == 0 and episodeNum != 0:
            #print("Average return so far: ", returnSum / episodeNum)
            pass
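
The learn() / evaluate() pairs in this listing are normally driven by a small experiment script. Assuming everything lives in one file, so that learn() and evaluate() as defined above can see the shared tables, a sketch of how the pieces fit together looks like this; the table size, step size, epsilon and episode counts are illustrative only.

import numpy as np
import blackjack

NUM_STATES = 181                   # assumed size of the state space
Q1 = np.zeros((NUM_STATES, 2))     # double-Q tables shared by learn/evaluate
Q2 = np.zeros((NUM_STATES, 2))

def policy(s):
    # greedy policy over the summed tables, also usable with blackjack.printPolicy
    return int(np.argmax(Q1[s] + Q2[s]))

if __name__ == "__main__":
    learn(alpha=0.001, eps=0.1, numTrainingEpisodes=500000)   # fill Q1 and Q2
    blackjack.printPolicy(policy)                             # show the learned policy
    print("Average greedy return:", evaluate(numEvaluationEpisodes=10000))
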
Example #32
def learnEpisode(alpha, eps, gamma):
        currentState = blackjack.init() # returns the initial state
        episodeReturn = 0
        while(True):  # repeat for each step of the episode
            action = epsGreedyPolicy(currentState, eps)
            (reward, nextState) = blackjack.sample(currentState, action)
            episodeReturn += reward
            if(nextState):
                if(np.random.randint(0,2)):  # will return ints between [0,2)
                    Q1[currentState, action] = Q1[currentState, action] + alpha * ( reward + gamma * Q2[nextState, np.argmax(Q1[nextState])] - Q1[currentState, action])
                else:
                    Q2[currentState, action] = Q2[currentState, action] + alpha * ( reward + gamma * Q1[nextState, np.argmax(Q2[nextState])] - Q2[currentState, action])
                currentState = nextState
            else: # we know it's the terminal state, so the 'next rewards' simplify to 0 and can be omitted
                if(np.random.randint(0,2)):  # will return ints between [0,2)
                    Q1[currentState, action] = Q1[currentState, action] + alpha * ( reward - Q1[currentState, action])
                else:
                    Q2[currentState, action] = Q2[currentState, action] + alpha * ( reward - Q2[currentState, action])
                return episodeReturn # nextState is False, so the episode is over
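
For reference, the backup that learnEpisode() applies, and that the other double Q-learning snippets repeat inline, can be pulled out into one helper. This only restates the rule already used above: a fair coin picks which table to update, the other table evaluates the chosen table's greedy action at the next state, and at the terminal state the bootstrap term is dropped.

import numpy as np

def double_q_update(Q1, Q2, s, a, r, s_next, alpha, gamma=1.0):
    # One double Q-learning backup for the transition (s, a, r, s_next);
    # s_next is falsy (False/None) at the terminal state, as in the snippet above.
    if np.random.randint(0, 2):
        target = r + (gamma * Q2[s_next, np.argmax(Q1[s_next])] if s_next else 0.0)
        Q1[s, a] += alpha * (target - Q1[s, a])
    else:
        target = r + (gamma * Q1[s_next, np.argmax(Q2[s_next])] if s_next else 0.0)
        Q2[s, a] += alpha * (target - Q2[s, a])
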
Example #33
def bjrandomPolicy(numEpisodes=10000):
    # Input: number of Episodes
    # Output: Average Return over number of episodes
    # Policy: Equally Random
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        # Equally random action selection, assuming gamma = 1
        s=blackjack.init()
        while s!=-1: #-1 is terminal
            # Rand int returns a number between 0, 1
            a=random.randint(0,1)
            G,sp=blackjack.sample(s,a)
            s=sp
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G

    print("Average return: ", returnSum/numEpisodes)
    return None
Example #34
def bjrandomPolicy(numEpisodes=10000):
    # Input: number of Episodes
    # Output: Average Return over number of episodes
    # Policy: Equally Random
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        # Equally random action selection, assuming gamma = 1
        s = blackjack.init()
        while s != -1:  #-1 is terminal
            # Rand int returns a number between 0, 1
            a = random.randint(0, 1)
            G, sp = blackjack.sample(s, a)
            s = sp
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G

    print("Average return: ", returnSum / numEpisodes)
    return None
Example #35
def exp_sarsa(state):
	global returnSum
	s=state
	while s!=-1:
		if Q[s,0]>Q[s,1]:
			a=0
		else:
			a=1
		r,s_=bj.sample(s,a)
# a_ is the action chosen by the target epsilon-greedy policy
		rand=np.random.random()
		if rand<epi:
#		rand
			a_=np.random.randint(0,2)
		else:
#		greedy
			a_=np.argmax(Q[s_])
		Q[s,a]=Q[s,a]+alpha*(r+Q[s_,a_]-Q[s,a])
		s=s_
	returnSum=returnSum + r
Example #36
        def onlyExploitQ(self):          
          iterations = 0
          returnSum = 0
          
          while iterations < self.MAXITERATIONS:
            s = blackjack.init()
            reward, state = blackjack.sample(s,1)
            if state == -1:
              returnSum = returnSum+reward
            while state != -1:
              A = self.maxAction(self.q, state)
              reward, statePrime = self.giveSample(state, A)
              returnSum = returnSum + reward

              state = statePrime
            iterations = iterations + 1
            if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
                print("Average Return During Exploitation Phase at "+str(iterations)+" is "+str(returnSum/iterations))
            
          return returnSum/self.MAXITERATIONS
Example #37
import blackjack
from pylab import *

numEpisodes = 2000

returnSum = 0.0
for episodeNum in range(numEpisodes):
	G = 0
	currentstate = blackjack.init()
	while(currentstate != -1):
		action = randint(2) #randomly pick the action
		next = blackjack.sample(currentstate, action)
		G = G + next[0]
		currentstate = next[1]
	print "Episode: ", episodeNum, "Return: ", G
	returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example #38
import blackjack
from pylab import *

numEpisodes = 2000

returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    # my code starts here
    
    
    R,S = blackjack.sample(blackjack.init(),(randint(0, 2)))
    if (S==(-1)): R=1
    while (S!=(-1)):
        R,S = blackjack.sample(S,(randint(0, 2)))    
    G = R    
    
            
    print "Episode: ", episodeNum, "Return: ", G 
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example #39
alpha = 0.001
returnSum = 0.0

Q = [[0, 0] for i in range(183)]  # one independent [action 0, action 1] pair per state; [[0,0]]*183 would alias a single list
#for i in Q:
 #   i[0],i[1]= uniform(0,1),uniform(0,1)

for episodeNum in range(numEpisodes):
    G = 0
    # my code starts here
    a=0
    
    S =blackjack.init()
    t = Q[S]
    if (e > randint(0, 2)):
        R,S_ =blackjack.sample(S,randint(0,2))
    else:
        if t[0]>t[1]:
            R,S_ = blackjack.sample(S,0)
            a=0
        else:
            R,S_ = blackjack.sample(S,1)
            a=1
    while (S_!=(-1)):
        Q[S][a] = Q[S][a] + alpha*(R + Q[S_][0]+Q[S_][1]-Q[S][a])
        S=S_
        t = Q[S]
        if (e > randint(0, 2)):
            R,S_ = blackjack.sample(S,randint(0,2))
        else:
            if t[0]>t[1]:
                R,S_ = blackjack.sample(S,0)
                a=0
            else:
                R,S_ = blackjack.sample(S,1)
                a=1

Example #40
def policyPrint(state):
    return argmax(Q[state])
      
    
for episodeNum in range(numEpisodes):
    #blackjack.init()
    G = 0
    state = 0
    while state != -1:
        #take action according the the beahaviour policy 
        if rand() <= epsilonMu:
            action = randint(2)
        else:
            action = argmax(Q[state])
        #Do that action 
        result = blackjack.sample(state,action)
        reward = result[0]
        newState = result[1]
        
        #Expected Sarsa 
        Q[state, action] = Q[state, action] + alpha *(reward + policySum(newState,epsilonPi) - Q[state, action])
        
        #update values
        G+= reward
        state = newState
         
    if episodeNum % 10000 == 0 and episodeNum != 0:
        print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum/(episodeNum)
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example #41
import blackjack
from pylab import *
import numpy as np
import random

numEpisodes = 2000
returnSum = 0.0

"""
Returns equiprobable random policy
"""
def policy():
    return random.randint(0,1)	

"""
Experiment
"""
for episodeNum in range(numEpisodes):
    G = 0
    state=0
    blackjack.init()
    while (state != -1):
        returntuple=blackjack.sample(state,policy())
        reward=returntuple[0]
        state=returntuple[1]
        G += reward
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes

Example #42
numEpisodesLearn =  1000000
numEpisodesEval  = 10000000
alpha = 1e-3
eps_mu = 1e-2
eps_pi = 1e-2
Q = 1e-4 * np.random.random((2 + numStates, numActions))
Q[-1] = np.zeros((numActions))

returnSum = 0.0
for episodeNum in xrange(numEpisodesLearn):
    G = 0.0
    s = blackjack.init()
    while (s != -1):
        a = np.argmax(Q[s]) if np.random.random() > eps_mu \
            else np.random.randint(numActions)
        (r, sp) = blackjack.sample(s, a)
        v_pi = eps_pi * np.average(Q[sp]) + (1 - eps_pi) * np.max(Q[sp])
        Q[s, a] += alpha * (r + gamma * v_pi - Q[s, a])
        G = r + gamma * G
        s = sp
    returnSum += G
    ep = episodeNum + 1 
    if (ep % 10000 == 0):
        print "Episode: ", ep, "Average return: ", returnSum / ep
print "Average return while learning: ", returnSum / numEpisodesLearn

greedy = lambda s: np.argmax(Q[s])
blackjack.printPolicy(greedy)

returnSum = 0.0
for episodeNum in xrange(numEpisodesEval):
Example #43
	#while S is not in terminal state
	while S != -1:

		#Choose action here based on epsilon
		decider = random.random()
		if decider <= epsilon:
			A = randint(0,1)
		else:
			# A = the best action to take
			if Q[S][0] >= Q[S][1]:
				A = 0
			else:
				A = 1

		R,Sprime = blackjack.sample(S,A)
		G = G + R

		if episodeNum > dropAlpha:
			print R


		if Sprime == -1:
			Q[S][A] = Q[S][A] + alpha*(R - Q[S][A])	
		else:
			Q[S][A] = Q[S][A] + alpha*(R + gamma*(max(Q[Sprime][0],Q[Sprime][1])) - Q[S][A])
		
		S = Sprime

	if episodeNum == dropEpsilonEpisode:
		#print "=============================END EXPLORING PHASE============================="
Example #44
 def giveSample(self,state, action):
   return blackjack.sample(state, action)
Example #45
 def gameStart(self):
   return blackjack.sample(0,1)
Example #46
Q = zeros((M, N))

returnSum = 0.0
epsilon = 0.1
alpha = 0.001
for episodeNum in range(numEpisodes):
	random.seed(episodeNum)
	# Cumulative reward
	G = 0
	# Init the game of blackjack and get the initial state
	s = blackjack.init()
	#Consider each step of the episode
	while s!=-1: #-1 is terminal
		# Take epsilon greedy action at each step of episode
		a = getEpsilonGreedyAction(Q, s, epsilon)
		r,sp = blackjack.sample(s,a)
		# Update action value function with Q-learning off-policy update
		Q[s, a] = Q[s, a] + alpha * (r + max(Q[sp, :]) - Q[s, a])
		G += r
		s=sp
	
	if not(episodeNum % 10000) :
		print("Episode: ", episodeNum, "Return: ", G)
	returnSum = returnSum + G
	
print("Average return: ", returnSum/numEpisodes)
blackjack.printPolicy(getLearnedPolicy)

# Run learned policy
policySum = 0.0
for policyEpisodeNum in range(numEpisodes):
Example #47
def random_policy (list_of_actions): #returns a random action from a list of possible actions
    next_action = choice(list_of_actions)
    #print next_action
    return next_action

numEpisodes = 10000

returnSum = 0.0
actions = [0,1]


for episodeNum in range(numEpisodes):
    s = blackjack.init();
    G = 0
    while (s != -1):
        a = random_policy(actions)
        result = blackjack.sample(s, a)
        #print blackjack.sample (0, 1)
        G = G + result[0]
        s = result[1]
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G

print "Average return: ", returnSum/numEpisodes


printPolicy(Q)
	


Example #48
    G = 0
    #Start a new game of blackjack
    currentState = blackjack.init()
    #Continue this game until the terminal state is reached
    while(currentState != -1):
        #Get a random number between 0 and 1, if its less than epsilon behavior, then explore
       
        rnumber = n.random.rand()
        if rnumber < epsilon:
            action = n.random.randint(2)
        else:
            #If not exploring, pick the highest action at state S
            action = returnPolicy(currentState)

        #Get the reward and next state
        next = blackjack.sample(currentState, action)
        reward = next[0]
        nextstate = next[1]
        #Add to return
        G = G + reward
        
        #Get chance of being greedy
        greedychance = 1-epsilon
        
        #Get best value at the next state
        highest = argmax(states[nextstate])
        
        #Expected sarsa calculation (greedy * best_next_state_action) + (explore * (0.5*next_state_action1 + 0.5*next_state_action2))
        target = (greedychance * states[nextstate][highest]) + (epsilon * (0.5*states[nextstate][0] + 0.5*states[nextstate][1]))
        states[currentState][action] = states[currentState][action] + alpha * (reward + target - states[currentState][action]) 
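
The "Expected sarsa calculation" comment above spells out the bootstrap term; written on its own it is just the epsilon-greedy expectation over the next state's two action values. A small sketch using the snippet's own variable names (states is the action-value table):

def expected_next_value(states, nextstate, epsilon):
    # (greedy prob * best next action value) + (explore prob * average of both),
    # exactly the target used in the update above.
    best = max(states[nextstate][0], states[nextstate][1])
    uniform = 0.5 * (states[nextstate][0] + states[nextstate][1])
    return (1 - epsilon) * best + epsilon * uniform
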
            
Example #49
    if np.argmax(Q[s]) == a:
        return 1 - e + e/num_actions
    else:
        return e/num_actions


#Learning the policy through the Expected Sarsa algorithm
Q =  0.00001*np.random.rand(num_states,num_actions)

for episodeNum in range(numEpisodes):
    G = 0

    s = bj.init()
    while s != -1:
        a = np.random.choice(2, p=[actionProb(emu,0,s),actionProb(emu,1,s)])
        r, s1 = bj.sample(s,a)
        Q[s,a] = Q[s,a] + alpha*(r + actionProb(epi,0,s1)*Q[s1,0] + actionProb(epi,1,s1)*Q[s1,1] - Q[s,a])
        s = s1
        G+=r

    returnSum = returnSum + G

    if episodeNum%10000 == 0:
        print "Episode: ", episodeNum
        print "Average return: ", returnSum/(episodeNum+1)

#Function for the learned policy
def learnedPolicy(s):
    return np.argmax(Q[s])

#Printing out the learned policy
Example #50
#---------------------------------------------------------------
# Course:           CMPUT 366
# Assignment:       Project1
# Due Date:         Nov 5, 2015
# Names:            Mujda Abbasi - Zainab Alsharif
# Student ID:         1298314         1223455
#---------------------------------------------------------------

import blackjack as bj
import numpy as np
from pylab import *

numEpisodes = 2000
returnSum = 0.0

for episodeNum in range(numEpisodes):
    G = 0
    s = bj.init()
    while s != -1:
        r, s = bj.sample(s, np.random.randint(2))
        G += r

    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
Example #51
import blackjack
from pylab import *
from random import randint

numEpisodes = 2000

returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    S = blackjack.init()

    while S != -1:
        R, Sprime = blackjack.sample(S, randint(0,1))
        G = G + R
        S = Sprime

    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example #52
import blackjack
import numpy as np
from pylab import *

numEpisodes = 2000
returnSum = 0.0

for episodeNum in range(numEpisodes):
	s = blackjack.init()
	while s!=-1:
		a = np.random.randint(0,2)
		G,s_=blackjack.sample(s,a)
		s=s_
	print "Episode: ", episodeNum, "Return: ", G
	returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example #53
def giveSample(state):
    return blackjack.sample(state, giveAction())
Example #54
import blackjack
from pylab import *
from random import *

numEpisodes = 2000

returnSum = 0.0
for episodeNum in range(numEpisodes):
    G =0
    black = blackjack.init()
    action =[0,1]
    while black!=-1:
        num = randint(0,1)
        n,black = blackjack.sample(black,action[num])
        
        G+=n

    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
Example #55
		return Q[state][np.argmax(Q[state])]


"""
Experiments:

First learn policy and calculate average return
"""

for episodeNum in range(numEpisodes):
	blackjack.init()
	state=0
	return1=0
	while (state != -1):
		action = policy(state)
		reward,statep=blackjack.sample(state,action) 
		Q[state][action] = Q[state][action] + alpha*(reward + expectedValue(statep) - Q[state][action])
		state = statep
		return1+=reward
	returnSum+=return1
	if (((episodeNum % 10000) == 0) and (episodeNum != 0)):
		print "Count =",episodeNum,"Average return: ", returnSum/(episodeNum)
	



blackjack.printPolicy(learnedPolicy)
print "Average return: ", float(returnSum)/float(numEpisodes)
returnSumLearned=0

"""
from pylab import *
import numpy as np
import random

numEpisodes = 2000
returnSum = 0.0
"""
Returns equiprobable random policy
"""


def policy():
    return random.randint(0, 1)


"""
Experiment
"""
for episodeNum in range(numEpisodes):
    G = 0
    state = 0
    blackjack.init()
    while (state != -1):
        returntuple = blackjack.sample(state, policy())
        reward = returntuple[0]
        state = returntuple[1]
        G += reward
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
Example #57
#---------------------------------------------------------------
# Course:           CMPUT 366
# Assignment:       Project1
# Due Date:         Nov 5, 2015
# Names:            Mujda Abbasi - Zainab Alsharif
# Student ID:         1298314         1223455
#---------------------------------------------------------------

import blackjack as bj
import numpy as np
from pylab import *

numEpisodes = 2000
returnSum = 0.0

for episodeNum in range(numEpisodes):
    G = 0
    s = bj.init()
    while s != -1:
        r, s = bj.sample(s,np.random.randint(2))
        G+=r
    
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes