return valueF if __name__ == '__main__': grid = negativeGrid() valueF = {} policy = {} for s in grid.allStates(): valueF[s] = 0 # state -> action policy = { (2, 0): 'U', (1, 0): 'U', (0, 0): 'R', (0, 1): 'R', (0, 2): 'R', (1, 2): 'R', (2, 1): 'R', (2, 2): 'R', (2, 3): 'U', } statesRewardsList = [] printPolicy(policy, grid) for n in range(2000): valueF = playGame(grid, policy) print("final values:") printValues(valueF, grid) printPolicy(policy, grid) print("\n\n")
stateActionsReturnsList = []
printPolicy(policy, grid)

# Monte-Carlo control: alternate policy evaluation (first-visit MC estimate
# of Q(s, a)) with greedy policy improvement, for 2000 episodes.
for n in range(2000):
    # Track (state, action) pairs already credited this episode
    # so only the FIRST visit's return is recorded (first-visit MC).
    seenStates = set()
    stateActionsReturnsList = playGame(grid, policy)
    for s, a, r in stateActionsReturnsList:
        if (s, a) not in seenStates:
            # first visit policy
            seenStates.add((s, a))
            if (s, a) in allReturns:
                allReturns[(s, a)].append(r)
            else:
                # BUGFIX: must seed a LIST of returns, not the scalar r —
                # otherwise the .append() above raises AttributeError on
                # the second visit to this (s, a) pair.
                allReturns[(s, a)] = [r]
            # Q(s, a) estimate = mean of all first-visit returns so far.
            valueF[s][a] = np.mean(allReturns[(s, a)])

    # Policy improvement: make the policy greedy w.r.t. the current Q.
    # (assumes this runs once per episode, inside the loop — standard MC
    # control structure; original indentation was lost)
    for s in states:
        policy[s] = getMaximumFromDict(valueF[s])[0]

# V(s) = max_a Q(s, a) under the final (greedy) policy.
V = {}
for s in policy.keys():
    V[s] = getMaximumFromDict(valueF[s])[1]

print("final values:")
printValues(V, grid)
printPolicy(policy, grid)
print("\n\n")