import numpy as np

def improvePolicy(policy, values, discountFactor):
    stable = True
    # all locations in the grid and all possible moves
    alllocations = [(x, y) for x in range(11) for y in range(11)]
    moves = [(0, 0), (0, 1), (0, -1), (1, 0), (-1, 0)]
    # loop over all possible states
    for predloc in alllocations:
        for preyloc in alllocations:
            if predloc == preyloc:
                continue
            prey = Prey(*preyloc)
            oldPolicy = policy[(predloc, preyloc)]
            bestPolicy = (0, 0)
            bestVal = 0
            # calculate the greedy policy with respect to the current values
            for predMove in moves:
                newPredloc = ((predloc[0] + predMove[0]) % 11,
                              (predloc[1] + predMove[1]) % 11)
                preySum = 0
                if newPredloc == preyloc:
                    # capture: immediate reward, episode ends
                    preySum += 10.0
                else:
                    for preyProb, newPreyloc in prey.expand(newPredloc):
                        preySum += preyProb * discountFactor * values[(newPredloc, newPreyloc)]
                if bestVal <= preySum:
                    bestVal = preySum
                    bestPolicy = predMove
            policy[(predloc, preyloc)] = bestPolicy
            # keep track of whether the policy was adjusted
            if oldPolicy != bestPolicy:
                stable = False
    return policy, stable
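# The functions in this section assume environment classes Prey and Agent that
# are not shown here. A minimal sketch of the interface they rely on, assuming
# a prey that stays put with probability 0.8 and otherwise moves to one of its
# four toroidal neighbours with equal probability, and a predator with a
# uniform random policy over its five moves; the original classes may differ.
class Prey:
    def __init__(self, x, y):
        self.loc = (x, y)

    def expand(self, predloc):
        # (probability, successor location) pairs for the prey,
        # never moving onto the predator
        moves = [(0, 1), (0, -1), (1, 0), (-1, 0)]
        options = []
        for dx, dy in moves:
            newloc = ((self.loc[0] + dx) % 11, (self.loc[1] + dy) % 11)
            if newloc != predloc:
                options.append(newloc)
        succs = [(0.8, self.loc)]
        succs += [(0.2 / len(options), newloc) for newloc in options]
        return succs

class Agent:
    def __init__(self, x, y):
        self.loc = (x, y)

    def setLocation(self, loc):
        self.loc = loc

    def getMoveList(self):
        # uniform probability over the five predator moves
        moves = [(0, 0), (0, 1), (0, -1), (1, 0), (-1, 0)]
        return [(1.0 / len(moves), m) for m in moves]

    def locAfterMove(self, move):
        return ((self.loc[0] + move[0]) % 11, (self.loc[1] + move[1]) % 11)

    def expand(self):
        # (probability, successor location) pairs under the current policy
        return [(prob, self.locAfterMove(move)) for prob, move in self.getMoveList()]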
def valueIteration(discountFactor):
    # all locations in the grid
    alllocations = [(x, y) for x in range(11) for y in range(11)]
    # initialize values
    values = {}
    bestMoves = {}
    for predloc in alllocations:
        for preyloc in alllocations:
            if preyloc != predloc:
                values[(predloc, preyloc)] = 0
    agent = Agent(0, 0)
    deltas = []
    epsilon = 0.01
    delta = 1
    numIt = 0
    # perform value iteration according to the pseudo-code
    while delta > epsilon:
        delta = 0
        newValues = {}
        # loop over all states
        for predloc in alllocations:
            for preyloc in alllocations:
                if predloc == preyloc:
                    continue
                agent.setLocation(predloc)
                prey = Prey(*preyloc)
                temp = values[(predloc, preyloc)]
                # find the optimal value according to the current values
                bestVal = 0
                bestMove = (0, 0)
                for prob, predMove in agent.getMoveList():
                    preySum = 0
                    newPredloc = ((predloc[0] + predMove[0]) % 11,
                                  (predloc[1] + predMove[1]) % 11)
                    if newPredloc == preyloc:
                        preySum += 10.0
                    else:
                        for preyProb, newPreyloc in prey.expand(newPredloc):
                            preySum += preyProb * discountFactor * values[(newPredloc, newPreyloc)]
                    if bestVal <= preySum:
                        bestVal = preySum
                        bestMove = predMove
                newValues[(predloc, preyloc)] = bestVal
                bestMoves[(predloc, preyloc)] = bestMove
                delta = max(delta, np.abs(bestVal - temp))
        values = newValues
        deltas.append(delta)
        numIt += 1

    # greedy policy with respect to the optimal values computed above
    def policy(state):
        predloc, preyloc = state
        return bestMoves[(predloc, preyloc)]

    return numIt, values, policy
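# Example usage of valueIteration; the discount factor and the state printed
# below are illustrative only (this assumes Agent and Prey are available, e.g.
# as sketched above).
numIt, values, policy = valueIteration(0.8)
print("value iteration converged after", numIt, "sweeps")
print("V((0,0),(5,5)) =", values[((0, 0), (5, 5))])
print("greedy move in that state:", policy(((0, 0), (5, 5))))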
# Variant of valueIteration that exploits the toroidal symmetry of the world by
# encoding each state as the relative position of the prey with respect to the
# predator (this definition shadows the one above if both are loaded).
def valueIteration():
    # the relative positions vary from -5 up to 5, in both dimensions
    alldiffs = [(x, y) for x in range(-5, 6) for y in range(-5, 6)]
    alldiffs.remove((0, 0))
    values = {}
    for x in range(-5, 6):
        for y in range(-5, 6):
            values[(x, y)] = 0
    bestMoves = {}
    agent = Agent(0, 0)
    deltas = []
    discountFactor = 0.8
    epsilon = 0.01
    delta = 1
    while delta > epsilon:
        delta = 0
        newValues = {}
        for diff in alldiffs:
            # we place the predator in the middle of the world; this is allowed
            # because positions are encoded relative to the predator
            predloc = (5, 5)
            preyloc = (predloc[0] + diff[0], predloc[1] + diff[1])
            curKey = rewriteStates(predloc, preyloc)
            agent.setLocation(predloc)
            prey = Prey(*preyloc)
            temp = values[curKey]
            bestVal = 0
            bestMove = (0, 0)
            for prob, predMove in agent.getMoveList():
                preySum = 0
                newPredloc = agent.locAfterMove(predMove)
                if newPredloc == preyloc:
                    preySum += 10.0
                else:
                    for preyProb, newPreyloc in prey.expand(newPredloc):
                        # rewriteStates converts absolute positions into relative ones
                        preySum += preyProb * discountFactor * values[rewriteStates(newPredloc, newPreyloc)]
                if bestVal <= preySum:
                    bestVal = preySum
                    bestMove = predMove
            newValues[curKey] = bestVal
            bestMoves[curKey] = bestMove
            delta = max(delta, np.abs(bestVal - temp))
        values = newValues
        deltas.append(delta)

    def policy(state):
        predloc, preyloc = state
        return bestMoves[rewriteStates(predloc, preyloc)]

    return policy
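# The relative-state variant relies on a helper rewriteStates that is not shown
# above. A minimal sketch, assuming an 11x11 toroidal grid on which the wrapped
# prey-minus-predator difference is mapped into the range [-5, 5] per dimension:
def rewriteStates(predloc, preyloc):
    # shortest wrapped difference between prey and predator in each dimension
    dx = (preyloc[0] - predloc[0] + 5) % 11 - 5
    dy = (preyloc[1] - predloc[1] + 5) % 11 - 5
    return (dx, dy)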
def valueFunction():
    # all locations on the grid
    alllocations = [(x, y) for x in range(11) for y in range(11)]
    # initialize value function
    values = {}
    for predloc in alllocations:
        for preyloc in alllocations:
            if preyloc != predloc:
                values[(predloc, preyloc)] = 0
    # predator which is placed in the top-left
    agent = Agent(0, 0)
    discountFactor = 0.8
    epsilon = 0.01
    delta = 1
    numIt = 0
    while delta > epsilon:
        delta = 0
        newValues = {}
        # sweep over all possible states
        for predloc in alllocations:
            for preyloc in alllocations:
                if predloc == preyloc:
                    continue
                # place predator and prey at their locations
                agent.setLocation(predloc)
                prey = Prey(*preyloc)
                # temp is the previous value of the state
                temp = values[(predloc, preyloc)]
                moveSum = 0
                # iterate over each action the agent can take
                # and the probability of the action according to the policy
                for prob, newPredloc in agent.expand():
                    preySum = 0
                    # absorbing state
                    if newPredloc == preyloc:
                        preySum += 10.0
                    else:
                        # iterate over the states which the action can lead to,
                        # and their probability (stochastic)
                        for preyProb, newPreyloc in prey.expand(newPredloc):
                            # part of update rule (sum over s')
                            preySum += preyProb * discountFactor * values[(newPredloc, newPreyloc)]
                    # part of update rule (sum over a)
                    moveSum += prob * preySum
                # policy evaluation update
                newValues[(predloc, preyloc)] = moveSum
                delta = max(delta, np.abs(moveSum - temp))
        values = newValues
        numIt += 1
    return values, numIt
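# In update-rule form, valueFunction performs iterative policy evaluation:
#   V(s) <- sum_a pi(a|s) * sum_s' P(s'|s,a) * (R(s,a,s') + gamma * V(s'))
# with R = 10 on capture (an absorbing state, so no discounted future term) and
# R = 0 otherwise. Example usage; the printed state is illustrative only:
values, numIt = valueFunction()
print("policy evaluation converged after", numIt, "sweeps")
print("V((0,0),(5,5)) =", values[((0, 0), (5, 5))])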
def evaluatePolicy(policy, discountFactor, values=None):
    # all locations on the grid
    alllocations = [(x, y) for x in range(11) for y in range(11)]
    # initialize values if None is given
    if values is None:
        values = {}
        for predloc in alllocations:
            for preyloc in alllocations:
                if preyloc != predloc:
                    values[(predloc, preyloc)] = 0
    epsilon = 0.01  # convergence threshold
    delta = 1
    numIt = 0
    # update the values according to the given pseudo-code
    while delta > epsilon:
        delta = 0
        newValues = {}  # will be filled with new values
        # loop over all states
        for predloc in alllocations:
            for preyloc in alllocations:
                if predloc == preyloc:  # impossible state
                    continue
                prey = Prey(*preyloc)
                temp = values[(predloc, preyloc)]
                # make the move prescribed by the policy
                predMove = policy[(predloc, preyloc)]
                newPredloc = ((predloc[0] + predMove[0]) % 11,
                              (predloc[1] + predMove[1]) % 11)
                preySum = 0
                # calculate the discounted sum
                if newPredloc == preyloc:
                    preySum += 10.0  # game ends after this
                else:
                    for preyProb, newPreyloc in prey.expand(newPredloc):
                        preySum += preyProb * discountFactor * values[(newPredloc, newPreyloc)]
                newValues[(predloc, preyloc)] = preySum
                delta = max(delta, np.abs(preySum - temp))
        values = newValues
        numIt += 1
    return values, numIt
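# A minimal sketch of how evaluatePolicy and improvePolicy can be combined into
# full policy iteration. The function name policyIteration and the initial
# "always stay" policy are our own choices, not part of the original code.
def policyIteration(discountFactor):
    alllocations = [(x, y) for x in range(11) for y in range(11)]
    # start from an arbitrary policy: the predator stays put in every state
    policy = {(predloc, preyloc): (0, 0)
              for predloc in alllocations
              for preyloc in alllocations
              if predloc != preyloc}
    values = None
    stable = False
    while not stable:
        # evaluate the current policy, then improve it greedily;
        # the previous values are reused as a warm start
        values, _ = evaluatePolicy(policy, discountFactor, values)
        policy, stable = improvePolicy(policy, values, discountFactor)
    return policy, values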