Example No. 1
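All of the listings below use np and GridWorld without importing them. A header along the following lines is needed before any of the classes will run; the numpy import is certain, but the module that provides GridWorld is not shown in the source, so that import path is only a placeholder.

import numpy as np
# GridWorld is the project's own grid environment; its module path does not
# appear in these listings, so the line below is a placeholder rather than
# the original import.
from gridworld import GridWorld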
class TDL_solution:
    def __init__(self):
        self.game = GridWorld((5, 5))
        self.squareCountGrid = self.game.createSquareCount()
        self.alpha = 0.1
        self.gamma = 0.9
    
    def playTDLGame(self,startSquare, randomMove):
        self.game.currentSquare = startSquare
        
        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare,0)]
     
        while keepPlaying:
            
            #policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]
      
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i,j))
               
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0,len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append( (self.game.currentSquare,theReturn) )
            keepPlaying = not self.game.gameOver()
        
        G = 0
        self.squares_and_values = []
        for square , theReturn in reversed(squares_and_returns):
            self.squares_and_values.append( (square,G) )
            G = theReturn + self.game.gamma*G
        #self.squares_and_values.reverse()
    
    def playSarsa(self,startSquare, randomMove):
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        
        while keepPlaying:
            
            #policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
      
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1,j1))
                print(f"{i1} {j1} {moves} {move}")  # debug trace of the exploration step
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0,len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha*(theReturn + self.gamma*self.game.valueGrid[i2][j2]- self.game.valueGrid[i1][j1] )
            keepPlaying = not self.game.gameOver()
            
    def playQLearning(self,startSquare, randomMove):
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        
        while keepPlaying:
            
            #policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
            
            # Q-learning is off-policy: the bootstrap target uses the value of
            # the square the greedy (policy) move leads to, even when the
            # exploration branch below replaces the move actually taken.
            if move == 0:
                i3, j3 = i1 - 1, j1
            elif move == 1:
                i3, j3 = i1, j1 + 1
            elif move == 2:
                i3, j3 = i1 + 1, j1
            else:
                i3, j3 = i1, j1 - 1
      
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1,j1))
                print(f"{i1} {j1} {moves} {move}")  # debug trace of the exploration step
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0,len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha*(theReturn + self.gamma*self.game.valueGrid[i3][j3]- self.game.valueGrid[i1][j1] )
            keepPlaying = not self.game.gameOver()
    
        
        
    def updateValueGrid(self):
        for t in range(len(self.squares_and_values) -1):
            
            square , _ = self.squares_and_values[t]
            nextSquare, value = self.squares_and_values[t+1]
            i1 = square[0]
            j1 = square[1]
            i2 = nextSquare[0]
            j2 = nextSquare[1]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha*(value + self.gamma*self.game.valueGrid[i2][j2]- self.game.valueGrid[i1][j1] )  
    
    def updatePolicyGrid(self):
        
        # Greedy policy improvement: for every square that currently holds a
        # move (0-3), re-pick bestMove() and report whether anything changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0,1,2,3]:
                    self.game.currentSquare = (i,j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change
        
        
    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
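A minimal driver loop for TDL_solution, sketched under a few assumptions: (0, 0) is taken as a legal non-terminal start square, randomMove is the probability of keeping the greedy move (exploration happens when np.random.rand() exceeds it), and the episode count is purely illustrative.

# Hypothetical driver for TDL_solution; start square and episode count are
# illustrative assumptions, not values from the original project.
solver = TDL_solution()
for episode in range(500):
    # One epsilon-greedy episode with in-place SARSA updates to the value grid.
    solver.playSarsa(startSquare=(0, 0), randomMove=0.9)
    # Greedy improvement against the values learned so far.
    solver.updatePolicyGrid()
solver.printGrids()

playQLearning can be dropped in for playSarsa with the same signature; playTDLGame followed by updateValueGrid instead replays a whole episode before updating.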
class DP_Solution:
    def __init__(self, gamma, lower_limit):
        self.game = GridWorld((5, 5))
        self.gamma = gamma
        self.lower_limit = lower_limit

    def updateValueGrid(self):
        rows = self.game.size[0]
        cols = self.game.size[1]
        for i in range(rows):
            for j in range(cols):
                move = self.game.policyGrid[i][j]
                #print(str(i)+" " + str(j) +" " +str(move))
                if move in [0, 1, 2, 3]:
                    if move == 0:
                        theReturn = self.game.returnGrid[i - 1][j]
                        self.game.valueGrid[i][j] = self.gamma * (
                            theReturn + self.game.valueGrid[i - 1][j])
                    if move == 1:
                        theReturn = self.game.returnGrid[i][j + 1]
                        self.game.valueGrid[i][j] = self.gamma * (
                            theReturn + self.game.valueGrid[i][j + 1])
                    if move == 2:
                        theReturn = self.game.returnGrid[i + 1][j]
                        self.game.valueGrid[i][j] = self.gamma * (
                            theReturn + self.game.valueGrid[i + 1][j])
                    if move == 3:
                        theReturn = self.game.returnGrid[i][j - 1]
                        self.game.valueGrid[i][j] = self.gamma * (
                            theReturn + self.game.valueGrid[i][j - 1])

    def updateValueGridWindy(self, successRate=0.75):
        rows = self.game.size[0]
        cols = self.game.size[1]

        for i in range(rows):
            for j in range(cols):
                possibleMoves = self.game.possibleMoves((i, j))
                nrOfWrongMoves = len(possibleMoves) - 1
                chosenMove = self.game.policyGrid[i][j]
                if self.game.policyGrid[i][j] not in [-1, 9]:
                    self.game.valueGrid[i][j] = 0
                    for move in possibleMoves:
                        if move == chosenMove:
                            p = successRate
                        else:
                            if nrOfWrongMoves != 0:
                                p = (1 - successRate) / nrOfWrongMoves
                            else:
                                p = 0  # shouldn't happen: every square has at least one legal move
                        if move == 0:
                            theReturn = self.game.returnGrid[i - 1][j]
                            self.game.valueGrid[i][j] += p * self.gamma * (
                                theReturn + self.game.valueGrid[i - 1][j])
                        if move == 1:
                            theReturn = self.game.returnGrid[i][j + 1]
                            self.game.valueGrid[i][j] += p * self.gamma * (
                                theReturn + self.game.valueGrid[i][j + 1])
                        if move == 2:
                            theReturn = self.game.returnGrid[i + 1][j]
                            self.game.valueGrid[i][j] += p * self.gamma * (
                                theReturn + self.game.valueGrid[i + 1][j])
                        if move == 3:
                            theReturn = self.game.returnGrid[i][j - 1]
                            self.game.valueGrid[i][j] += p * self.gamma * (
                                theReturn + self.game.valueGrid[i][j - 1])

    def updatePolicyGrid(self):

        # Greedy policy improvement: for every square that currently holds a
        # move (0-3), re-pick bestMove() and report whether anything changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def updatePolicyGridWindy(self):
        # The greedy improvement step is identical in the windy grid, so this
        # simply delegates to updatePolicyGrid.
        return self.updatePolicyGrid()

    def updateUntilConvergence(self):
        change = True
        count = 0

        while change:
            change = self.updatePolicyGrid()
            self.updateValueGridWindy()
            count += 1
            if count % 1000 == 0:
                print("count: " + str(count))
            if count > 10000:
                print("didnt converge")
                break

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
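DP_Solution is driven through updateUntilConvergence, which alternates greedy policy improvement with the windy value update until the policy stops changing. A sketch follows, with gamma chosen for illustration; note that lower_limit is stored by the constructor but never read in the listing above.

# Hypothetical usage of DP_Solution; gamma and lower_limit are illustrative values.
dp = DP_Solution(gamma=0.9, lower_limit=1e-4)
dp.updateUntilConvergence()  # loops updatePolicyGrid / updateValueGridWindy
dp.printGrids()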
Example No. 3
class MC_solution:
    def __init__(self):
        self.game = GridWorld((5, 5))
        self.squareCountGrid = self.game.createSquareCount()

    def playMCGame(self, startSquare, randomMove):
        self.game.currentSquare = startSquare

        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare, 0)]

        while keepPlaying:

            #policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]

            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i, j))

                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append((self.game.currentSquare, theReturn))
            keepPlaying = not self.game.gameOver()

        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G
        #self.squares_and_values.reverse()

    def updateValueGrid(self):
        visitedSquares = set()

        for square, G in self.squares_and_values:
            #print(square)
            if square not in visitedSquares:
                visitedSquares.add(square)
                i = square[0]
                j = square[1]
                self.squareCountGrid[i][j] += 1
                self.game.valueGrid[i][j] = self.game.valueGrid[i][j] + (
                    G - self.game.valueGrid[i][j]) / self.squareCountGrid[i][j]

    def updatePolicyGrid(self):

        # Greedy policy improvement: for every square that currently holds a
        # move (0-3), re-pick bestMove() and report whether anything changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
        print(self.squareCountGrid)
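MC_solution needs many epsilon-soft episodes before the first-visit averages settle. The driver below is a sketch; the start square, exploration level, and episode count are illustrative assumptions.

# Hypothetical driver for first-visit Monte Carlo evaluation and improvement.
mc = MC_solution()
for episode in range(2000):
    mc.playMCGame(startSquare=(0, 0), randomMove=0.9)  # one epsilon-soft episode
    mc.updateValueGrid()    # first-visit averaging via squareCountGrid
    mc.updatePolicyGrid()   # greedy improvement
mc.printGrids()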
Example No. 4
class MC_Aprox_Solution:
    def __init__(self):
        self.game = GridWorld((5, 5))
        self.learning_rate = 0.001
        self.theta = np.random.randn(4) / 2

    def s2x(self, square):
        return np.array(
            [square[0] - 1, square[1] - 1.5, square[0] * square[1] - 3, 1])

    def playMCGame(self, startSquare, randomMove):
        self.game.currentSquare = startSquare

        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare, 0)]
        counter = 0
        while keepPlaying:

            counter += 1
            if counter > 2000:
                return False

            #policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]

            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i, j))

                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append((self.game.currentSquare, theReturn))
            keepPlaying = not self.game.gameOver()

        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G

        return True

    def updateValueGrid(self, t):
        visitedSquares = set()

        alpha = self.learning_rate / (t + 1)
        for square, G in self.squares_and_values:
            #print(square)
            if square not in visitedSquares:
                visitedSquares.add(square)

                old_theta = self.theta.copy()
                x = self.s2x(square)
                V_hat = self.theta.dot(x)

                self.theta += alpha * (G - V_hat) * x

        rows = self.game.size[0]
        cols = self.game.size[1]
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.valueGrid[i][j] = self.theta.dot(self.s2x(
                        (i, j)))

    def updatePolicyGrid(self):

        # Greedy policy improvement: for every square that currently holds a
        # move (0-3), re-pick bestMove() and report whether anything changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
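The approximate version is driven the same way, except that playMCGame returns False when an episode is cut off after 2000 steps (in which case there is nothing to update) and updateValueGrid takes the episode index so the step size can decay as learning_rate / (t + 1). A sketch with illustrative settings:

# Hypothetical driver for Monte Carlo with a linear value approximator.
approx = MC_Aprox_Solution()
for t in range(5000):
    if not approx.playMCGame(startSquare=(0, 0), randomMove=0.9):
        continue                 # episode hit the 2000-step cutoff
    approx.updateValueGrid(t)    # gradient step on theta, then refresh valueGrid
    approx.updatePolicyGrid()
approx.printGrids()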