return valueF


import numpy as np  # np.mean is used below; drop this line if NumPy is already imported at the top of the file

if __name__ == '__main__':

    grid = negativeGrid()

    # initialize V(s) = 0 for every state
    valueF = {}
    for s in grid.allStates():
        valueF[s] = 0
    # fixed initial policy: state -> action
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
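    # Phase 1: Monte Carlo prediction under the fixed policy above.
    # playGame is assumed to play one episode and return updated value
    # estimates for the states it visits.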
    printPolicy(policy, grid)
    for _ in range(2000):
        valueF = playGame(grid, policy)

    print("final values:")
    printValues(valueF, grid)
    printPolicy(policy, grid)
    print("\n\n")
    printPolicy(policy, grid)
    for _ in range(2000):
        seenStateActions = set()
        # playGame is assumed here to return (state, action, return) triples
        stateActionsReturnsList = playGame(grid, policy)
        for s, a, r in stateActionsReturnsList:
            if (s, a) not in seenStateActions:
                # first-visit MC: only the first occurrence of (s, a) in an
                # episode contributes to the return average
                seenStateActions.add((s, a))
                if (s, a) in allReturns:
                    allReturns[(s, a)].append(r)
                else:
                    allReturns[(s, a)] = [r]  # store a list so later returns can be appended
                valueF[s][a] = np.mean(allReturns[(s, a)])
        # policy improvement: act greedily with respect to the current Q
        for s in policy.keys():
            policy[s] = getMaximumFromDict(valueF[s])[0]

    # derive V(s) = max_a Q(s, a) for display
    V = {}
    for s in policy.keys():
        V[s] = getMaximumFromDict(valueF[s])[1]

    print("final values:")
    printValues(V, grid)
    #print(valueF)
    printPolicy(policy, grid)
    print("\n\n")